nexaai-1.0.17rc10-cp310-cp310-macosx_13_0_x86_64.whl → nexaai-1.0.18-cp310-cp310-macosx_13_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nexaai might be problematic.

nexaai/_stub.cpython-310-darwin.so CHANGED
Binary file
nexaai/_version.py CHANGED
@@ -1,4 +1,4 @@
 # This file is generated by CMake from _version.py.in
 # Do not modify this file manually - it will be overwritten
 
-__version__ = "1.0.17-rc10"
+__version__ = "1.0.18"
nexaai/binds/libnexa_bridge.dylib CHANGED
Binary file
nexaai/mlx_backend/vlm/generate_qwen3_vl.py CHANGED
@@ -41,7 +41,12 @@ except ImportError:
 from ml import ChatMessage
 from dataclasses import dataclass
 from typing import Any, Generator, List, Optional, Sequence, Tuple, Union
-from .generate import GenerationResult
+from .generate import GenerationResult
+
+# Custom exception for context length exceeded
+class ContextLengthExceededError(Exception):
+    """Raised when input context length exceeds model's maximum context size"""
+    pass
 
 @dataclass
 class Qwen3VLBundledModel:
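
As an illustration only (not part of the package), a minimal caller-side sketch of how code that drives generation might branch on the new exception; the class is redefined locally here just to keep the snippet self-contained:

    class ContextLengthExceededError(Exception):
        """Raised when input context length exceeds the model's maximum context size."""

    def run_safely(generate_fn, prompt: str) -> str:
        try:
            return generate_fn(prompt)
        except ContextLengthExceededError as exc:
            # Surface an actionable message instead of a generic failure.
            return f"Prompt rejected: {exc}"

    print(run_safely(lambda p: p.upper(), "hello"))  # -> HELLO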
@@ -67,6 +72,7 @@ def load_qwen3_vl(
 
     Parameters are aligned with .generate.load for compatibility.
     """
+
     model_path = Path(path_or_repo)
     if not model_path.exists():
         if "/" in path_or_repo:
@@ -154,7 +160,6 @@ def load_qwen3_vl(
     if quantization_bits in [4, 8]:
         nn.quantize(llm_model, bits=quantization_bits, group_size=64,
                     class_predicate=quant_predicate)
-    # For f32 (32-bit), no quantization needed
 
     llm_model.load_weights(str(llm_weights_path), strict=True)
 
@@ -166,11 +171,15 @@
 
 def apply_chat_template_qwen3_vl(messages: Sequence[ChatMessage], num_images: int = 0, num_audios: int = 0, tools: Optional[str] = None, enable_thinking: bool = False) -> str:
     """Apply chat template: serialize messages with content as a list of typed items."""
+
     messages_dict = []
-    for msg in messages:
+    for i, msg in enumerate(messages):
         content_items = [{"type": "text", "text": msg.content}]
         messages_dict.append({"role": msg.role, "content": content_items})
-    return json.dumps(messages_dict)
+
+    result = json.dumps(messages_dict)
+
+    return result
 
 
 def stream_generate_qwen3_vl(
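
For illustration, the serialization this template performs can be reproduced standalone; ChatMessage is stubbed as a plain dataclass here (the real one comes from the ml module imported in the diff):

    import json
    from dataclasses import dataclass

    @dataclass
    class ChatMessage:
        role: str
        content: str

    def serialize(messages):
        # Each message becomes {"role": ..., "content": [{"type": "text", "text": ...}]}
        messages_dict = []
        for msg in messages:
            content_items = [{"type": "text", "text": msg.content}]
            messages_dict.append({"role": msg.role, "content": content_items})
        return json.dumps(messages_dict)

    print(serialize([ChatMessage("user", "Describe this image.")]))
    # [{"role": "user", "content": [{"type": "text", "text": "Describe this image."}]}]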
@@ -184,15 +193,22 @@ def stream_generate_qwen3_vl(
 
 ) -> Generator[Any, None, None]:
     """Stream generation yielding .generate.GenerationResult-compatible chunks."""
-    messages = json.loads(prompt)
+
+    try:
+        messages = json.loads(prompt)
+    except json.JSONDecodeError as e:
+        raise
+
     if image is not None:
         image_list = image if isinstance(image, list) else [image]
         pil_images = []
-        for p in image_list:
+        for i, p in enumerate(image_list):
             try:
-                pil_images.append(Image.open(p))
-            except Exception:
+                img = Image.open(p)
+                pil_images.append(img)
+            except Exception as e:
                 continue
+
         contents = [{"type": "image", "image": img} for img in pil_images]
         if messages:
             if "content" not in messages[-1] or not isinstance(messages[-1]["content"], list):
@@ -201,6 +217,7 @@ def stream_generate_qwen3_vl(
 
     raw_text, processed_images = processor.messages_to_text(
         messages, add_generation_prompt=True)
+
 
     inputs = processor.text_to_input_ids(
         raw_text, images=processed_images, return_tensors="mlx")
@@ -208,10 +225,18 @@
     input_ids = inputs["input_ids"]
     pixel_values = inputs.get("pixel_values")
     image_grid_thw = inputs.get("image_grid_thw")
+
+
+    # Check if input context exceeds KV cache size and raise error
+    max_kv_size = 4096  # This should match the max_kv_size used in make_prompt_cache and nexa_generate_step
+    if input_ids.size > max_kv_size:
+        error_msg = f"Input context length ({input_ids.size} tokens) exceeds maximum supported context size ({max_kv_size} tokens). Please reduce the input length."
+        raise ContextLengthExceededError(error_msg)
 
     inputs_embeds, deepstack_visual_embeds, visual_pos_masks, cos, sin, rope_deltas = handle_multimodal_embeds(
         model.vision_model, model.llm_model, input_ids, pixel_values, image_grid_thw
     )
+
 
     prompt_cache = make_prompt_cache(model.llm_model, max_kv_size=4096)
     tokenizer = processor.tokenizer
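
For illustration, the guard added above amounts to comparing the prompt's token count against the 4096-token KV-cache capacity before generation starts; a standalone sketch with illustrative names:

    MAX_KV_SIZE = 4096  # mirrors the max_kv_size value used in the diff

    class ContextLengthExceededError(Exception):
        pass

    def check_context(num_prompt_tokens: int, max_kv_size: int = MAX_KV_SIZE) -> None:
        # Refuse generation up front instead of failing mid-stream.
        if num_prompt_tokens > max_kv_size:
            raise ContextLengthExceededError(
                f"Input context length ({num_prompt_tokens} tokens) exceeds maximum "
                f"supported context size ({max_kv_size} tokens)."
            )

    check_context(1024)    # passes silently
    # check_context(5000)  # would raise ContextLengthExceededError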
@@ -222,37 +247,45 @@ def stream_generate_qwen3_vl(
 
     gen_count = 0
     tic = time.perf_counter()
+
 
-    for token, logprobs in nexa_generate_step(
-        model=model.llm_model,
-        prompt=None,
-        input_embeddings=inputs_embeds,
-        max_tokens=max_tokens,
-        max_kv_size=4096,
-        prompt_cache=prompt_cache,
-        visual_pos_masks=visual_pos_masks,
-        deepstack_visual_embeds=deepstack_visual_embeds,
-        cos=cos,
-        sin=sin,
-        rope_deltas=rope_deltas,
-    ):
-        if token == tokenizer.eos_token_id:
-            break
-
-        text_piece = tokenizer.decode([token])
-        gen_count += 1
-
-        yield GenerationResult(
-            text=text_piece,
-            token=token,
-            logprobs=logprobs,
-            prompt_tokens=int(input_ids.size),
-            generation_tokens=gen_count,
-            prompt_tps=float(prompt_tps),
-            generation_tps=float(
-                gen_count / max(1e-6, (time.perf_counter() - tic))),
-            peak_memory=float(mx.get_peak_memory() / 1e9),
-        )
+    try:
+        for token, logprobs in nexa_generate_step(
+            model=model.llm_model,
+            prompt=None,
+            input_embeddings=inputs_embeds,
+            max_tokens=max_tokens,
+            max_kv_size=4096,
+            prompt_cache=prompt_cache,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            cos=cos,
+            sin=sin,
+            rope_deltas=rope_deltas,
+        ):
+            if token == tokenizer.eos_token_id:
+                break
+
+            text_piece = tokenizer.decode([token])
+            gen_count += 1
+
+            current_tps = gen_count / max(1e-6, (time.perf_counter() - tic))
+
+            yield GenerationResult(
+                text=text_piece,
+                token=token,
+                logprobs=logprobs,
+                prompt_tokens=int(input_ids.size),
+                generation_tokens=gen_count,
+                prompt_tps=float(prompt_tps),
+                generation_tps=float(current_tps),
+                peak_memory=float(mx.get_peak_memory() / 1e9),
+            )
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        raise
+
 
 def quant_predicate(path: str, mod: nn.Module) -> bool:
     """Quantization predicate to exclude certain layers from quantization."""
nexaai/mlx_backend/vlm/interface.py CHANGED
@@ -25,7 +25,7 @@ from profiling import ProfilingMixin, ProfilingData, StopReason
 
 # Import from the actual mlx_vlm structure
 from .generate import generate, stream_generate, load
-from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl
+from .generate_qwen3_vl import apply_chat_template_qwen3_vl, stream_generate_qwen3_vl, load_qwen3_vl, ContextLengthExceededError
 
 from .modeling.prompt_utils import apply_chat_template
 
@@ -124,7 +124,7 @@ class VLM(ProfilingMixin):
         prompt: str,
         config: Optional[GenerationConfig] = None,
     ) -> GenerationResult:
-        """Generate text from prompt."""
+        """Generate text from prompt."""
         # Start profiling
         self._start_profiling()
 
@@ -148,12 +148,15 @@ class VLM(ProfilingMixin):
         # Extract incremental portion of the prompt (similar to llama.cpp VLM)
         full_prompt_len = len(prompt)
         incremental_prompt = prompt
-
-        if self.global_n_past_chars < full_prompt_len:
-            incremental_prompt = prompt[self.global_n_past_chars:]
-        else:
-            # No new text to process
-            incremental_prompt = ""
+
+        # Apply incremental processing only for non-qwen3vl models
+        # qwen3vl requires complete JSON conversation structure
+        if self.model_name != "qwen3vl":
+            if self.global_n_past_chars < full_prompt_len:
+                incremental_prompt = prompt[self.global_n_past_chars:]
+            else:
+                # No new text to process
+                incremental_prompt = ""
 
         # End prompt processing, start decode
         self._prompt_end()
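
For illustration, the incremental-prompt branch above reduces to the following standalone function; model_name and global_n_past_chars mirror the attributes used in the diff, while the function itself is only a sketch:

    def select_prompt(prompt: str, global_n_past_chars: int, model_name: str) -> str:
        full_prompt_len = len(prompt)
        incremental_prompt = prompt
        # qwen3vl receives the complete JSON conversation every call; other models
        # only receive the characters that were not processed previously.
        if model_name != "qwen3vl":
            if global_n_past_chars < full_prompt_len:
                incremental_prompt = prompt[global_n_past_chars:]
            else:
                incremental_prompt = ""
        return incremental_prompt

    assert select_prompt("hello world", 6, "other-model") == "world"
    assert select_prompt('[{"role": "user"}]', 6, "qwen3vl") == '[{"role": "user"}]'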
@@ -196,13 +199,15 @@ class VLM(ProfilingMixin):
             self._update_generated_tokens(generated_tokens)
             self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
 
-            # Update global character position
-            self.global_n_past_chars = full_prompt_len + len(text)
+            # Update global character position (not needed for qwen3vl JSON processing)
+            if self.model_name != "qwen3vl":
+                old_pos = self.global_n_past_chars
+                self.global_n_past_chars = full_prompt_len + len(text)
 
             self._decode_end()
             self._end_profiling()
 
-            return GenerationResult(
+            result = GenerationResult(
                 text=text,
                 prompt_tokens=prompt_tokens,
                 generation_tokens=generated_tokens,
@@ -211,7 +216,18 @@ class VLM(ProfilingMixin):
                 generation_tps=stats.get("generation_tps", 0.0),
                 peak_memory=stats.get("peak_memory", 0.0),
             )
+
+            return result
+
+        except ContextLengthExceededError as e:
+            self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+            self._decode_end()
+            self._end_profiling()
+            # Re-raise the original exception without wrapping it
+            raise e
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
             self._decode_end()
             self._end_profiling()
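
For illustration, because ContextLengthExceededError is re-raised without being wrapped, application code can treat "prompt too long" separately from other failures; a hypothetical wrapper (the import path follows the file layout in RECORD, and vlm stands for an already constructed VLM instance, whose construction is not shown in this diff):

    from nexaai.mlx_backend.vlm.generate_qwen3_vl import ContextLengthExceededError

    def generate_with_fallback(vlm, prompt: str, keep_last_chars: int = 4000):
        try:
            return vlm.generate(prompt)
        except ContextLengthExceededError:
            # Retry once with a shortened prompt; a real application would trim
            # whole conversation turns rather than raw characters.
            return vlm.generate(prompt[-keep_last_chars:])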
@@ -224,6 +240,7 @@ class VLM(ProfilingMixin):
         on_token: Optional[TokenCallback],
     ) -> GenerationResult:
         """Generate text with streaming callback. Unified method for both text and multimodal generation."""
+
         # Start profiling
         self._start_profiling()
 
@@ -236,6 +253,7 @@ class VLM(ProfilingMixin):
         if self.sampler_config is not None:
             gen_kwargs.update(self.sampler_config.__dict__)
 
+
         # Get image and audio paths from config
         image_paths = config.image_paths if config else None
         audio_paths = config.audio_paths if config else None
@@ -244,15 +262,20 @@ class VLM(ProfilingMixin):
         image_list = [str(path) for path in image_paths] if image_paths else None
         audio_list = [str(path) for path in audio_paths] if audio_paths else None
 
+
         # Extract incremental portion of the prompt (similar to llama.cpp VLM)
         full_prompt_len = len(prompt)
         incremental_prompt = prompt
 
-        if self.global_n_past_chars < full_prompt_len:
-            incremental_prompt = prompt[self.global_n_past_chars:]
-        else:
-            # No new text to process
-            incremental_prompt = ""
+
+        # Apply incremental processing only for non-qwen3vl models
+        # qwen3vl requires complete JSON conversation structure
+        if self.model_name != "qwen3vl":
+            if self.global_n_past_chars < full_prompt_len:
+                incremental_prompt = prompt[self.global_n_past_chars:]
+            else:
+                # No new text to process
+                incremental_prompt = ""
 
         # End prompt processing, start decode
         self._prompt_end()
@@ -264,6 +287,8 @@ class VLM(ProfilingMixin):
         stream_generate_impl = stream_generate_qwen3_vl if self.model_name == "qwen3vl" else stream_generate
 
         try:
+            token_count = 0
+
             for result in stream_generate_impl(
                 self.model,
                 self.processor,
@@ -272,7 +297,9 @@ class VLM(ProfilingMixin):
                 audio=audio_list,
                 **gen_kwargs,
             ):
-                # Record TTFT on first token
+                token_count += 1
+
+                # Record TTFT on first token
                 if first_token:
                     self._record_ttft()
                     first_token = False
@@ -285,6 +312,7 @@ class VLM(ProfilingMixin):
                 text += result.text
                 last_result = result
 
+
             # Set stop reason if not user stop
             if self._profiling_context.stop_reason != StopReason.ML_STOP_REASON_USER:
                 self._set_stop_reason(StopReason.ML_STOP_REASON_EOS)
@@ -294,13 +322,15 @@ class VLM(ProfilingMixin):
             self._update_prompt_tokens(last_result.prompt_tokens)
             self._update_generated_tokens(last_result.generation_tokens)
 
-            # Update global character position
-            self.global_n_past_chars = full_prompt_len + len(text)
+            # Update global character position (not needed for qwen3vl JSON processing)
+            if self.model_name != "qwen3vl":
+                old_pos = self.global_n_past_chars
+                self.global_n_past_chars = full_prompt_len + len(text)
 
             self._decode_end()
             self._end_profiling()
 
-            return GenerationResult(
+            result = GenerationResult(
                 text=text,
                 token=last_result.token if last_result else None,
                 logprobs=last_result.logprobs if last_result else None,
@@ -311,7 +341,18 @@ class VLM(ProfilingMixin):
                 generation_tps=last_result.generation_tps if last_result else 0.0,
                 peak_memory=last_result.peak_memory if last_result else 0.0,
             )
+
+            return result
+
+        except ContextLengthExceededError as e:
+            self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
+            self._decode_end()
+            self._end_profiling()
+            # Re-raise the original exception without wrapping it
+            raise e
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             self._set_stop_reason(StopReason.ML_STOP_REASON_UNKNOWN)
             self._decode_end()
             self._end_profiling()
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nexaai
-Version: 1.0.17rc10
+Version: 1.0.18
 Summary: Python bindings for NexaSDK C-lib backend
 Author-email: "Nexa AI, Inc." <dev@nexa.ai>
 Project-URL: Homepage, https://github.com/NexaAI/nexasdk-bridge
@@ -1,6 +1,6 @@
 nexaai/__init__.py,sha256=L8oB7GFZZMGnUpCg0PecDbI_ycKuQak-ZEJ4Y12_QIw,2184
-nexaai/_stub.cpython-310-darwin.so,sha256=9tKb2YBVS2quTKD-OUHqxI2blqCaKef4HLZI6DeZwS4,49832
-nexaai/_version.py,sha256=nKUy6Z6ytRT-zOfWuQqpmzK7vKqdC96nmMqwtCJBwMM,144
+nexaai/_stub.cpython-310-darwin.so,sha256=rme8AeSXOZBPbhUbP9GKQTvYon2BSiwm3T1rVEmo0nA,49832
+nexaai/_version.py,sha256=u4x0epv_LKPUfQvNf5zaekZDwaMD-RVDvPvcAmx-b40,139
 nexaai/asr.py,sha256=NljMXDErwPNMOPaRkJZMEDka9Nk8xyur7L8i924TStY,2054
 nexaai/base.py,sha256=N8PRgDFA-XPku2vWnQIofQ7ipz3pPlO6f8YZGnuhquE,982
 nexaai/common.py,sha256=Y0NJNLTi4Nq4x1WL6PQsSvGUto0eGmWhjpsC6jcekfA,3444
@@ -19,7 +19,7 @@ nexaai/asr_impl/pybind_asr_impl.py,sha256=pE9Hb_hMi5yAc4MF83bLVOb8zDtreCkB3_u7XE
 nexaai/binds/__init__.py,sha256=eYuay_8DDXeOUWz2_R9HFSabohxs6hvZn391t2L0Po0,104
 nexaai/binds/common_bind.cpython-310-darwin.so,sha256=BoXByRlNGDaNS1YyZyCF-s7h0vXP9NLPlJMQQ5pqusU,235488
 nexaai/binds/embedder_bind.cpython-310-darwin.so,sha256=b2NoXFAJvPLi_P1X7lXLKmAUU0v2HJI3Zwa10gfqHdw,202032
-nexaai/binds/libnexa_bridge.dylib,sha256=Yopwbcp5VQ9NF6o9un48Kb5FoqnyIS3QxHNRh8ak_hU,250408
+nexaai/binds/libnexa_bridge.dylib,sha256=59iLj-0ieCv-tU5pcJc7Tj-84pseGPAXL7JOi19bdhc,250408
 nexaai/binds/llm_bind.cpython-310-darwin.so,sha256=p1ZTGMolEkWywkmwzOUjTr3RpSEH21BHZAggVzo89Ks,183088
 nexaai/binds/vlm_bind.cpython-310-darwin.so,sha256=LGd-tykePnQFfGca25HnPIBfXsfrMzbwyx6d5Ld3xps,183000
 nexaai/binds/nexa_llama_cpp/libggml-base.dylib,sha256=GyOkHOM-5uHp7NUZ4Sr9BWak6BYpcc9aqI9A-zPnQp4,629528
@@ -246,8 +246,8 @@ nexaai/mlx_backend/tts/__init__.py,sha256=fuT_9_xpYJ28m4yjly5L2jChUrzlSQz-b_S7nu
 nexaai/mlx_backend/tts/interface.py,sha256=0FvZbIyOvg8jERZEQ6bygbv7v02O9xHO4-TPUlar0b4,9568
 nexaai/mlx_backend/vlm/__init__.py,sha256=_25kvMEviX16Hg3bro8Ws70V0eeIEqYKV8ZDXqYzKew,73
 nexaai/mlx_backend/vlm/generate.py,sha256=DqHFEAuqk-nko8ho6U9GAXTDAWz4d8GTe_hCt-XFyCw,19071
-nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=undjso1mfxqpd6FMTksSA5qagRttxAGbOBj1x7cqI1s,9211
-nexaai/mlx_backend/vlm/interface.py,sha256=0BLfodbYOU71jFvAvv01FuLBE_KBtyB-8Cd7LqzzRHY,17450
+nexaai/mlx_backend/vlm/generate_qwen3_vl.py,sha256=eeizW18u6dHPZOOnJtQUJkiqMAIIpOSS-IOjacXGsz4,10240
+nexaai/mlx_backend/vlm/interface.py,sha256=HOPzWNMs6QaHO6x0Z83kW1xkRRmb8_xo6xQLKsOWqAo,19013
 nexaai/mlx_backend/vlm/main.py,sha256=nPcg25jupeDD74uvRoxpWp3Dsulw7WddI7vll6zejak,10664
 nexaai/mlx_backend/vlm/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/mlx_backend/vlm/modeling/convert.py,sha256=ia5i9cgTufFGmKyhkYUaW0nfNqT_bMo8i-Hg_zy5JC4,1863
@@ -387,7 +387,7 @@ nexaai/utils/quantization_utils.py,sha256=FYcNSAKGlBqFDUTx3jSKOr2lnq4nyiyC0ZG8oS
 nexaai/vlm_impl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nexaai/vlm_impl/mlx_vlm_impl.py,sha256=pLtWm_ckz8a0U-AtAOMVseFDO4OVPvHyYO2KlfBaGYk,10833
 nexaai/vlm_impl/pybind_vlm_impl.py,sha256=FAbhpRJzHgI78r0mUvKybO97R1szvNhH0aTn_I52oT4,8597
-nexaai-1.0.17rc10.dist-info/METADATA,sha256=CBp42bC2oj1pRu7t-v7qqiH6ZlQ1QLFHSpI3QL1JypU,1202
-nexaai-1.0.17rc10.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
-nexaai-1.0.17rc10.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
-nexaai-1.0.17rc10.dist-info/RECORD,,
+nexaai-1.0.18.dist-info/METADATA,sha256=yh4CFZmHv1dg0aN41La3qaRlXU5XC1_7erEOspSE95s,1198
+nexaai-1.0.18.dist-info/WHEEL,sha256=0KYp5feZ1CMUhsfFXKpSQTbSmQbXy4mv6yPPVBXg2EM,110
+nexaai-1.0.18.dist-info/top_level.txt,sha256=LRE2YERlrZk2vfuygnSzsEeqSknnZbz3Z1MHyNmBU4w,7
+nexaai-1.0.18.dist-info/RECORD,,