nexaai 1.0.19rc5__cp310-cp310-macosx_14_0_universal2.whl → 1.0.19rc7__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nexaai might be problematic. Click here for more details.

Files changed (221) hide show
  1. nexaai/_stub.cpython-310-darwin.so +0 -0
  2. nexaai/_version.py +1 -1
  3. nexaai/binds/libnexa_bridge.dylib +0 -0
  4. nexaai/binds/nexa_llama_cpp/libggml-base.dylib +0 -0
  5. nexaai/binds/nexa_llama_cpp/libggml-cpu.so +0 -0
  6. nexaai/binds/nexa_llama_cpp/libggml-metal.so +0 -0
  7. nexaai/binds/nexa_llama_cpp/libggml.dylib +0 -0
  8. nexaai/binds/nexa_llama_cpp/libllama.dylib +0 -0
  9. nexaai/binds/nexa_llama_cpp/libmtmd.dylib +0 -0
  10. nexaai/binds/nexa_llama_cpp/libnexa_plugin.dylib +0 -0
  11. nexaai/binds/nexa_mlx/libnexa_plugin.dylib +0 -0
  12. nexaai/binds/nexa_mlx/py-lib/asr/__init__.py +12 -0
  13. nexaai/binds/nexa_mlx/py-lib/asr/interface.py +122 -0
  14. nexaai/binds/nexa_mlx/py-lib/common/__init__.py +0 -0
  15. nexaai/binds/nexa_mlx/py-lib/common/utils.py +25 -0
  16. nexaai/binds/nexa_mlx/py-lib/cv/__init__.py +0 -0
  17. nexaai/binds/nexa_mlx/py-lib/cv/generate.py +195 -0
  18. nexaai/binds/nexa_mlx/py-lib/cv/interface.py +151 -0
  19. nexaai/binds/nexa_mlx/py-lib/cv/main.py +81 -0
  20. nexaai/binds/nexa_mlx/py-lib/cv/modeling/pp_ocr_v4.py +1736 -0
  21. nexaai/binds/nexa_mlx/py-lib/embedding/__init__.py +0 -0
  22. nexaai/binds/nexa_mlx/py-lib/embedding/generate.py +333 -0
  23. nexaai/binds/nexa_mlx/py-lib/embedding/interface.py +617 -0
  24. nexaai/binds/nexa_mlx/py-lib/embedding/main.py +173 -0
  25. nexaai/binds/nexa_mlx/py-lib/embedding/modeling/__init__.py +0 -0
  26. nexaai/binds/nexa_mlx/py-lib/embedding/modeling/nexa_jina_v2.py +399 -0
  27. nexaai/binds/nexa_mlx/py-lib/image_gen/__init__.py +1 -0
  28. nexaai/binds/nexa_mlx/py-lib/image_gen/generate_sd.py +244 -0
  29. nexaai/binds/nexa_mlx/py-lib/image_gen/interface.py +82 -0
  30. nexaai/binds/nexa_mlx/py-lib/image_gen/main.py +281 -0
  31. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/__init__.py +306 -0
  32. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/clip.py +116 -0
  33. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/config.py +65 -0
  34. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/model_io.py +386 -0
  35. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/sampler.py +105 -0
  36. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/tokenizer.py +100 -0
  37. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/unet.py +460 -0
  38. nexaai/binds/nexa_mlx/py-lib/image_gen/stable_diffusion/vae.py +274 -0
  39. nexaai/binds/nexa_mlx/py-lib/llm/__init__.py +0 -0
  40. nexaai/binds/nexa_mlx/py-lib/llm/generate.py +149 -0
  41. nexaai/binds/nexa_mlx/py-lib/llm/interface.py +764 -0
  42. nexaai/binds/nexa_mlx/py-lib/llm/main.py +68 -0
  43. nexaai/binds/nexa_mlx/py-lib/rerank/__init__.py +0 -0
  44. nexaai/binds/nexa_mlx/py-lib/rerank/generate.py +174 -0
  45. nexaai/binds/nexa_mlx/py-lib/rerank/interface.py +287 -0
  46. nexaai/binds/nexa_mlx/py-lib/rerank/main.py +127 -0
  47. nexaai/binds/nexa_mlx/py-lib/rerank/modeling/__init__.py +0 -0
  48. nexaai/binds/nexa_mlx/py-lib/rerank/modeling/nexa_jina_rerank.py +330 -0
  49. nexaai/binds/nexa_mlx/py-lib/sd/__init__.py +1 -0
  50. nexaai/binds/nexa_mlx/py-lib/sd/interface.py +362 -0
  51. nexaai/binds/nexa_mlx/py-lib/sd/main.py +286 -0
  52. nexaai/binds/nexa_mlx/py-lib/sd/modeling/__init__.py +306 -0
  53. nexaai/binds/nexa_mlx/py-lib/sd/modeling/clip.py +116 -0
  54. nexaai/binds/nexa_mlx/py-lib/sd/modeling/config.py +65 -0
  55. nexaai/binds/nexa_mlx/py-lib/sd/modeling/model_io.py +385 -0
  56. nexaai/binds/nexa_mlx/py-lib/sd/modeling/sampler.py +105 -0
  57. nexaai/binds/nexa_mlx/py-lib/sd/modeling/tokenizer.py +100 -0
  58. nexaai/binds/nexa_mlx/py-lib/sd/modeling/unet.py +460 -0
  59. nexaai/binds/nexa_mlx/py-lib/sd/modeling/vae.py +274 -0
  60. nexaai/binds/nexa_mlx/py-lib/tts/__init__.py +12 -0
  61. nexaai/binds/nexa_mlx/py-lib/tts/interface.py +276 -0
  62. nexaai/binds/nexa_mlx/py-lib/vlm/__init__.py +3 -0
  63. nexaai/binds/nexa_mlx/py-lib/vlm/generate.py +572 -0
  64. nexaai/binds/nexa_mlx/py-lib/vlm/generate_qwen3_vl.py +294 -0
  65. nexaai/binds/nexa_mlx/py-lib/vlm/generate_qwen3_vl_moe.py +276 -0
  66. nexaai/binds/nexa_mlx/py-lib/vlm/interface.py +504 -0
  67. nexaai/binds/nexa_mlx/py-lib/vlm/main.py +320 -0
  68. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/__init__.py +0 -0
  69. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/convert.py +68 -0
  70. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/__init__.py +0 -0
  71. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/__init__.py +8 -0
  72. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
  73. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/interpolate.py +186 -0
  74. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/language.py +233 -0
  75. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/aya_vision/vision.py +503 -0
  76. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/base.py +202 -0
  77. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/cache.py +230 -0
  78. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
  79. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
  80. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
  81. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
  82. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
  83. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
  84. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/__init__.py +8 -0
  85. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/florence2.py +366 -0
  86. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/language.py +488 -0
  87. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/florence2/vision.py +591 -0
  88. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/__init__.py +8 -0
  89. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/gemma3.py +213 -0
  90. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/language.py +315 -0
  91. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3/vision.py +238 -0
  92. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/__init__.py +2 -0
  93. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/audio.py +1038 -0
  94. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/config.py +139 -0
  95. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
  96. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/language.py +629 -0
  97. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/gemma3n/vision.py +1022 -0
  98. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/__init__.py +9 -0
  99. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/idefics2.py +294 -0
  100. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/language.py +191 -0
  101. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics2/vision.py +267 -0
  102. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/__init__.py +8 -0
  103. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/idefics3.py +175 -0
  104. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/language.py +192 -0
  105. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/idefics3/vision.py +233 -0
  106. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/__init__.py +9 -0
  107. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
  108. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/language.py +220 -0
  109. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/processor.py +393 -0
  110. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/internvl_chat/vision.py +293 -0
  111. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kernels.py +307 -0
  112. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/__init__.py +8 -0
  113. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
  114. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/language.py +509 -0
  115. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/kimi_vl/vision.py +522 -0
  116. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/__init__.py +8 -0
  117. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/language.py +386 -0
  118. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/llama4.py +138 -0
  119. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llama4/vision.py +560 -0
  120. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/__init__.py +8 -0
  121. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/language.py +240 -0
  122. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/llava.py +153 -0
  123. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava/vision.py +259 -0
  124. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/__init__.py +9 -0
  125. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/language.py +236 -0
  126. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
  127. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_bunny/vision.py +303 -0
  128. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/__init__.py +8 -0
  129. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/language.py +230 -0
  130. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/llava_next.py +160 -0
  131. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/llava_next/vision.py +243 -0
  132. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mistral3/__init__.py +8 -0
  133. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mistral3/mistral3.py +283 -0
  134. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/__init__.py +8 -0
  135. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/language.py +416 -0
  136. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/mllama.py +172 -0
  137. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/mllama/vision.py +499 -0
  138. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/__init__.py +8 -0
  139. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/language.py +243 -0
  140. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/molmo.py +133 -0
  141. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/molmo/vision.py +465 -0
  142. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/__init__.py +10 -0
  143. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/language.py +230 -0
  144. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
  145. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/sam.py +557 -0
  146. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/multi_modality/vision.py +526 -0
  147. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/__init__.py +8 -0
  148. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/language.py +282 -0
  149. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/paligemma.py +160 -0
  150. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/paligemma/vision.py +242 -0
  151. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/__init__.py +8 -0
  152. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/language.py +21 -0
  153. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
  154. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/su_rope.py +71 -0
  155. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/phi3_v/vision.py +324 -0
  156. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/__init__.py +8 -0
  157. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/language.py +229 -0
  158. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/pixtral.py +161 -0
  159. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/pixtral/vision.py +320 -0
  160. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
  161. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
  162. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
  163. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
  164. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
  165. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
  166. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/config.py +104 -0
  167. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/language.py +490 -0
  168. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
  169. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen2_vl/vision.py +312 -0
  170. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
  171. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
  172. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
  173. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
  174. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
  175. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
  176. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
  177. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/processor.py +476 -0
  178. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3_vl/qwen3vl.py +1223 -0
  179. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  180. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  181. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  182. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  183. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  184. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  185. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  186. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  187. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1309 -0
  188. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  189. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/smolvlm/__init__.py +8 -0
  190. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
  191. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/processing_qwen2_5_vl.py +209 -0
  192. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/processing_qwen2_vl.py +215 -0
  193. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/prompt_utils.py +474 -0
  194. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/sample_utils.py +39 -0
  195. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/tokenizer_utils.py +344 -0
  196. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/__init__.py +9 -0
  197. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/lora.py +70 -0
  198. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/trainer.py +296 -0
  199. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/trainer/utils.py +160 -0
  200. nexaai/binds/nexa_mlx/py-lib/vlm/modeling/utils.py +928 -0
  201. nexaai/binds/nexa_nexaml/libggml-base.dylib +0 -0
  202. nexaai/binds/nexa_nexaml/libggml-cpu.so +0 -0
  203. nexaai/binds/nexa_nexaml/libggml-metal.so +0 -0
  204. nexaai/binds/nexa_nexaml/libggml.dylib +0 -0
  205. nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +276 -0
  206. nexaai/mlx_backend/vlm/interface.py +21 -4
  207. nexaai/mlx_backend/vlm/main.py +6 -2
  208. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
  209. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
  210. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
  211. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
  212. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
  213. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
  214. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
  215. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
  216. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1309 -0
  217. nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
  218. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/METADATA +1 -1
  219. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/RECORD +221 -21
  220. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/WHEEL +0 -0
  221. {nexaai-1.0.19rc5.dist-info → nexaai-1.0.19rc7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,68 @@
1
+ # Copyright © 2024 Apple Inc.
2
+
3
+ from mlx_lm import generate, load
4
+
5
+
6
def test_llm_generate_stream(model_path):
    """Interactive multi-round chat loop against an mlx_lm model.

    Keeps the full conversation history and re-applies the chat template on
    every turn. Type 'quit', 'exit', or 'q' to leave the loop.
    """
    # Load the corresponding model and tokenizer
    model, tokenizer = load(path_or_hf_repo=model_path)

    history = []          # accumulated chat turns (user + assistant)
    max_tokens = 1_000    # generation budget per reply
    verbose = True        # print tokens/timing information from generate()

    print("Multi-round conversation started. Type 'quit' or 'exit' to end.")
    print("=" * 50)

    while True:
        text = input("\nUser: ").strip()

        # Exit commands end the session.
        if text.lower() in ["quit", "exit", "q"]:
            print("Goodbye!")
            break
        # Ignore empty input and re-prompt.
        if not text:
            continue

        history.append({"role": "user", "content": text})

        # Render the running conversation through the chat template.
        prompt = tokenizer.apply_chat_template(
            conversation=history, add_generation_prompt=True
        )

        print("Assistant: ", end="", flush=True)

        # generate() already handles the KV cache internally.
        reply = generate(
            model=model,
            tokenizer=tokenizer,
            prompt=prompt,
            max_tokens=max_tokens,
            verbose=verbose,
        )

        # Keep the assistant's reply in the history for the next round.
        history.append({"role": "assistant", "content": reply.strip()})

        print()  # newline after the response
61
+
62
+
63
if __name__ == "__main__":
    import argparse

    # Minimal CLI: only the model path/repo id is configurable.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_path",
        type=str,
        default="mlx-community/Qwen3-1.7B-4bit-DWQ",
    )
    test_llm_generate_stream(cli.parse_args().model_path)
File without changes
@@ -0,0 +1,174 @@
1
+ # Copyright © Nexa AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import sys
16
+ import os
17
+ import mlx.core as mx
18
+ import mlx.nn as nn
19
+ import numpy as np
20
+ import time
21
+
22
+ from transformers import AutoTokenizer
23
+ from huggingface_hub import snapshot_download
24
+ from .modeling.nexa_jina_rerank import Model, ModelArgs
25
+
26
+
27
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """Build padding-aware position ids.

    Non-padding tokens get cumulative positions (offset by padding_idx and
    any past KV length); padding slots collapse back to padding_idx.
    """
    real_token = (input_ids != padding_idx).astype(mx.int32)
    positions = (mx.cumsum(real_token, axis=1) + past_key_values_length) * real_token
    return positions.astype(mx.int32) + padding_idx
32
+
33
+
34
def prepare_inputs(query, documents, tokenizer, max_length=1024):
    """Tokenize (query, doc) pairs and build the tensors the model expects.

    Mirrors the torch reference implementation exactly: max-length padding,
    all-zero token_type_ids, and padding-aware position ids.
    """
    pairs = [[query, doc] for doc in documents]
    encoded = tokenizer(
        pairs,
        padding="max_length",
        truncation=True,
        return_tensors="np",
        max_length=max_length,
    )

    input_ids = mx.array(encoded["input_ids"]).astype(mx.int32)
    attention_mask = mx.array(encoded["attention_mask"]).astype(mx.float32)
    n_rows, seq_len = input_ids.shape

    # token_type_ids: a single all-zero row broadcast across the batch,
    # matching how the torch version builds a 1D tensor then expands it.
    token_type_ids = mx.broadcast_to(
        mx.expand_dims(mx.zeros(seq_len, dtype=mx.int32), axis=0), (n_rows, seq_len)
    )

    # Position ids per sequence (pad token id is 1 for this model).
    position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)

    return input_ids, attention_mask, token_type_ids, position_ids
60
+
61
+
62
def load_model(model_id):
    """Initialize and load the Jina V2 rerank model.

    Downloads the weights from the Hugging Face Hub into a local cache
    directory on first use, then builds the MLX model and loads the
    safetensors weights.

    Args:
        model_id: Hugging Face repo id to download when no local copy exists.

    Returns:
        Tuple of (model, model_dir); model_dir also holds the tokenizer files.

    Raises:
        FileNotFoundError: if no .safetensors file is present in model_dir.
    """
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = f"{curr_dir}/modelfiles/nexaml_jina_v2_rerank_mlx"

    # Download the model only when there is no local copy yet.
    if not os.path.exists(model_dir):
        print(f"Downloading model {model_id}...")

        os.makedirs(model_dir, exist_ok=True)

        try:
            snapshot_download(
                repo_id=model_id,
                allow_patterns=["*.safetensors", "config.json", "tokenizer*"],
                local_dir=model_dir,
                local_dir_use_symlinks=False
            )
            print("Model download completed!")
        except Exception as e:
            print(f"Failed to download model: {e}")
            print("Try: huggingface-cli login (if authentication required)")
            # Bug fix: the directory was created *before* the download, so a
            # failed download used to leave an empty/partial model_dir behind,
            # making every later call skip the download and fail confusingly
            # on missing weights. Remove it so the next call retries.
            import shutil
            shutil.rmtree(model_dir, ignore_errors=True)
            raise

    # Create model config
    config = ModelArgs()
    model = Model(config)

    # Load weights: prefer the canonical name, else any *.safetensors file.
    weight_file = os.path.join(model_dir, "model.safetensors")
    if not os.path.exists(weight_file):
        safetensors_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        if safetensors_files:
            weight_file = os.path.join(model_dir, safetensors_files[0])
        else:
            raise FileNotFoundError(f"No .safetensors file found in {model_dir}")

    print(f"Loading weights from: {weight_file}")
    model.load_weights(weight_file, strict=True)
    model.eval()

    return model, model_dir
105
+
106
+
107
def load_tokenizer(model_path):
    """Load the Hugging Face tokenizer stored alongside the model weights."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return tokenizer
110
+
111
+
112
def rerank_documents(model, tokenizer, query, documents, max_length=1024):
    """Score documents against a query.

    Returns (raw_scores, sigmoid_scores, inference_time_ms).
    """
    input_ids, attention_mask, token_type_ids, position_ids = prepare_inputs(
        query, documents, tokenizer, max_length
    )

    # Time the forward pass (including the squeeze of the trailing axis).
    t0 = time.time()
    raw = model.nexa_forward(input_ids, attention_mask, token_type_ids, position_ids)
    raw = mx.squeeze(raw, axis=-1)
    t1 = time.time()

    # Sigmoid maps raw logits to per-document probabilities.
    probs = mx.sigmoid(raw)

    elapsed_ms = (t1 - t0) * 1000  # seconds -> milliseconds

    return raw, probs, elapsed_ms
131
+
132
+
133
def main(model_id):
    """Demo entry point: load the reranker and print scores for sample docs."""

    # Load model and tokenizer from the same directory.
    model, model_path = load_model(model_id)
    tokenizer = load_tokenizer(model_path)

    # Example inputs: one distractor (coffee), one German paraphrase.
    query = "What are the health benefits of green tea?"
    documents = [
        "Green tea is rich in antioxidants and may improve brain function.",
        "Coffee contains caffeine and can boost energy levels.",
        "Das Trinken von grünem Tee kann das Risiko für Herzkrankheiten senken.",
        "Black tea is another popular beverage with its own health benefits.",
    ]

    scores, scores_sigmoid, inference_time = rerank_documents(
        model, tokenizer, query, documents
    )

    # Report results.
    print("=" * 70)
    print("Reranking Results:")
    print("=" * 70)
    print(f"Query: {query}")
    print()

    ranked = zip(documents, scores.tolist(), scores_sigmoid.tolist())
    for idx, (doc, score, prob) in enumerate(ranked, start=1):
        print(f"Document {idx}:")
        print(f"  Text: {doc}")
        print(f"  Score: {score:.4f}")
        print(f"  Probability: {prob:.4f}")
        print()

    print(f"Inference time: {inference_time:.1f}ms")
    print(f"Throughput: {len(documents)/inference_time*1000:.1f} docs/s")
170
+
171
+
172
if __name__ == "__main__":
    # Default demo repo; identical behavior to binding model_id first.
    main("nexaml/jina-v2-rerank-mlx")
@@ -0,0 +1,287 @@
1
+ # Copyright © Nexa AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ import json
19
+ import mlx.core as mx
20
+ import mlx.nn as nn
21
+ import numpy as np
22
+ import time
23
+ from pathlib import Path
24
+ from typing import Any, List, Optional, Sequence
25
+ from dataclasses import dataclass
26
+ from abc import ABC, abstractmethod
27
+
28
+ # Import necessary modules
29
+ from transformers import AutoTokenizer
30
+
31
+ # Import from ml.py for API alignment (assuming similar structure)
32
+ try:
33
+ from ml import (
34
+ Reranker as BaseReranker,
35
+ Path as PathType,
36
+ )
37
+ except ImportError:
38
+ # Fallback to local definitions if ml.py not available
39
+ PathType = Path
40
+ BaseReranker = ABC
41
+
42
+ # Import profiling module
43
+ from profiling import ProfilingMixin, ProfilingData, StopReason
44
+
45
+ # Import the model implementation
46
+ from .modeling.nexa_jina_rerank import Model, ModelArgs
47
+
48
+
49
@dataclass
class RerankConfig:
    """Configuration for reranking.

    The hand-written __init__ in the original duplicated (and overrode) the
    one that @dataclass already generates with the exact same parameters and
    defaults, so it has been removed; the constructor signature is unchanged.
    """
    batch_size: int = 1        # documents scored per forward pass
    normalize: bool = True     # whether to post-process raw scores
    normalize_method: str = "softmax"  # "softmax" | "min-max" | "none"
65
+
66
+
67
class Reranker(BaseReranker, ProfilingMixin):
    """MLX reranker exposing the ml.py-style Reranker API.

    Wraps the Jina V2 rerank model: loads weights and tokenizer from a local
    directory and scores (query, document) pairs, with optional score
    normalization. The model is loaded lazily via load_model().
    """

    def __init__(
        self,
        model_path: PathType,
        tokenizer_path: PathType,
        device: Optional[str] = None,
    ) -> None:
        """Record paths/device; no model weights are touched here."""
        ProfilingMixin.__init__(self)

        # A file path is collapsed to its containing directory.
        if os.path.isfile(model_path):
            model_path = os.path.dirname(model_path)

        # Cooperate with the ml.py base class when it is available.
        if hasattr(super(), '__init__'):
            super().__init__(model_path, tokenizer_path, device)

        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.device = "cpu" if device is None else device

        # Populated by load_model().
        self.model = None
        self.tokenizer = None
        self.config = None

    def destroy(self) -> None:
        """Drop model/tokenizer/config references so they can be collected."""
        self.model = None
        self.tokenizer = None
        self.config = None

    def load_model(self, model_path: PathType, extra_data: Any = None) -> bool:
        """Load model and tokenizer from *model_path*; True on success."""
        try:
            if model_path:
                # Same file-to-directory normalization as __init__.
                if os.path.isfile(model_path):
                    model_path = os.path.dirname(model_path)
                self.model_path = model_path

            self.model = self._load_jina_model(self.model_path)
            self.tokenizer = self._load_tokenizer()
            return True
        except Exception as e:
            print(f"Failed to load model: {e}")
            return False

    def close(self) -> None:
        """Alias for destroy()."""
        self.destroy()

    def rerank(
        self,
        query: str,
        documents: Sequence[str],
        config: Optional[RerankConfig] = None,
        clear_cache: bool = True,
    ) -> mx.array:
        """Score *documents* against *query*; returns one score per document."""
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        cfg = RerankConfig() if config is None else config

        # Whole call is profiled; batching counts as "prompt" time.
        self._start_profiling()
        self._prompt_start()

        step = cfg.batch_size
        chunks = []
        for start in range(0, len(documents), step):
            chunks.append(self._rerank_batch(query, documents[start:start + step], cfg))
            if clear_cache:
                mx.clear_cache()

        self._prompt_end()
        self._decode_start()

        # NOTE(review): an empty *documents* sequence leaves chunks empty and
        # chunks[0] raises IndexError — presumably callers never pass one.
        result = mx.concatenate(chunks, axis=0) if len(chunks) > 1 else chunks[0]

        self._decode_end()
        self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
        self._end_profiling()

        return result

    def _load_jina_model(self, model_dir: str) -> Model:
        """Build the Jina V2 model and load its safetensors weights."""
        if not os.path.exists(model_dir):
            raise ValueError(f"Model path does not exist: {model_dir}")

        # Remembered for _load_tokenizer().
        self._model_dir = model_dir

        model = Model(ModelArgs())

        # Prefer the canonical file name, else any *.safetensors present.
        weight_file = os.path.join(model_dir, "model.safetensors")
        if not os.path.exists(weight_file):
            candidates = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
            if not candidates:
                raise FileNotFoundError(f"No .safetensors file found in {model_dir}")
            weight_file = os.path.join(model_dir, candidates[0])

        model.load_weights(weight_file, strict=True)
        model.eval()

        return model

    def _load_tokenizer(self) -> AutoTokenizer:
        """Load the HF tokenizer stored next to the model weights."""
        return AutoTokenizer.from_pretrained(self._model_dir)

    def _rerank_batch(self, query: str, documents: List[str], config: RerankConfig) -> mx.array:
        """Score one batch of documents, normalizing when configured."""
        ids, attn_mask, type_ids, pos_ids = self._prepare_inputs(
            query, documents, self.tokenizer, max_length=1024
        )

        raw = self.model.nexa_forward(ids, attn_mask, type_ids, pos_ids)
        raw = mx.squeeze(raw, axis=-1)

        if not config.normalize:
            return raw
        return self._normalize_scores(raw, config.normalize_method)

    def _create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0):
        """Padding-aware position ids; padded slots collapse to padding_idx."""
        real_token = (input_ids != padding_idx).astype(mx.int32)
        positions = (mx.cumsum(real_token, axis=1) + past_key_values_length) * real_token
        return positions.astype(mx.int32) + padding_idx

    def _prepare_inputs(self, query, documents, tokenizer, max_length=1024):
        """Tokenize (query, doc) pairs and build model inputs (torch-parity)."""
        pairs = [[query, doc] for doc in documents]
        encoded = tokenizer(
            pairs,
            padding="max_length",
            truncation=True,
            return_tensors="np",
            max_length=max_length,
        )

        input_ids = mx.array(encoded["input_ids"]).astype(mx.int32)
        attention_mask = mx.array(encoded["attention_mask"]).astype(mx.float32)
        n_rows, seq_len = input_ids.shape

        # token_type_ids: one all-zero row broadcast across the batch.
        token_type_ids = mx.broadcast_to(
            mx.expand_dims(mx.zeros(seq_len, dtype=mx.int32), axis=0), (n_rows, seq_len)
        )

        # Position ids per sequence (pad token id is 1 for this model).
        position_ids = self._create_position_ids_from_input_ids(input_ids, padding_idx=1)

        return input_ids, attention_mask, token_type_ids, position_ids

    def _normalize_scores(self, scores: mx.array, method: str) -> mx.array:
        """Normalize scores via "softmax" or "min-max"; anything else passes through."""
        if method == "softmax":
            # 1D arrays normalize over axis 0, otherwise over the last axis.
            axis = 0 if len(scores.shape) == 1 else -1
            return mx.softmax(scores, axis=axis)
        if method == "min-max":
            lo = mx.min(scores)
            hi = mx.max(scores)
            if hi > lo:
                return (scores - lo) / (hi - lo)
            return scores
        # "none" and unrecognized methods: return unchanged.
        return scores
275
+
276
+
277
def create_reranker(
    model_path: PathType,
    tokenizer_path: Optional[PathType] = None,
    device: Optional[str] = None,
) -> Reranker:
    """Factory: build a Reranker, defaulting the tokenizer path to the model path."""
    effective_tokenizer = model_path if tokenizer_path is None else tokenizer_path
    return Reranker(model_path, effective_tokenizer, device)
@@ -0,0 +1,127 @@
1
+ # Copyright © Nexa AI
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+ import mlx.core as mx
17
+ from .interface import create_reranker, RerankConfig
18
+
19
+
20
def test_reranking(model_path: str = "nexaml/jina-v2-rerank-mlx"):
    """Exercise the reranking model end to end and print per-document scores.

    Args:
        model_path: Hub id or local path of the reranker model. Previously
            hard-coded inside the function body; it is now a defaulted
            parameter so callers (e.g. the CLI) can select a model.
    """
    # Create reranker instance
    reranker = create_reranker(model_path=model_path)

    # Load the model
    print("Loading reranking model...")
    # extra_data mirrors the model path, matching the convention used in main().
    success = reranker.load_model(model_path, extra_data=model_path)

    if not success:
        print("Failed to load model!")
        return

    print("✅ Model loaded successfully!")

    # Test query and documents (same as generate.py)
    query = "What are the health benefits of green tea?"
    documents = [
        "Green tea is rich in antioxidants and may improve brain function.",
        "Coffee contains caffeine and can boost energy levels.",
        "Das Trinken von grünem Tee kann das Risiko für Herzkrankheiten senken.",
        "Black tea is another popular beverage with its own health benefits.",
    ]

    # Request raw scores (no normalization) so sigmoid probabilities can be
    # computed and displayed alongside them.
    config = RerankConfig(
        batch_size=len(documents),
        normalize=False,
        normalize_method="none",
    )

    # Generate reranking scores
    start_time = time.time()
    scores = reranker.rerank(query, documents, config)
    end_time = time.time()

    # Calculate sigmoid probabilities manually
    scores_sigmoid = mx.sigmoid(scores).tolist()

    inference_time = (end_time - start_time) * 1000  # Convert to ms

    print("=" * 70)
    print("Reranking Results:")
    print("=" * 70)
    print(f"Query: {query}")
    print()

    for i, (doc, score, prob) in enumerate(zip(documents, scores.tolist(), scores_sigmoid)):
        print(f"Document {i+1}:")
        print(f"  Text: {doc}")
        print(f"  Score: {score:.4f}")
        print(f"  Probability: {prob:.4f}")
        print()

    print(f"Inference time: {inference_time:.1f}ms")
    # Guard against a zero-duration clock reading on very fast runs.
    if inference_time > 0:
        print(f"Throughput: {len(documents)/inference_time*1000:.1f} docs/s")

    # Cleanup
    reranker.close()
80
+
81
+
82
def main(model_id):
    """Run a minimal reranking demo, mirroring embedding generate.py's format.

    Loads the model identified by *model_id*, reranks two sample documents
    against a fixed query, and prints raw scores plus sigmoid probabilities.
    """
    reranker = create_reranker(model_path=model_id)

    # Bail out early when the model cannot be loaded.
    if not reranker.load_model(model_id, extra_data=model_id):
        print("Failed to load model!")
        return

    # Fixed demo inputs, kept identical to the embedding example.
    query = "What are the health benefits of green tea?"
    documents = [
        "Green tea is rich in antioxidants and may improve brain function.",
        "Coffee contains caffeine and can boost energy levels.",
    ]

    # Raw scores (normalization disabled); sigmoid is applied manually below.
    scores = reranker.rerank(query, documents, RerankConfig(normalize=False))
    scores_sigmoid = mx.sigmoid(scores)

    print(f"Scores shape: {scores.shape}")
    print(f"Score sample values: {scores.tolist()}")
    print(f"Scores min: {scores.min():.4f}, Max: {scores.max():.4f}, Mean: {scores.mean():.4f}, Std: {scores.std():.4f}")
    print(f"Sigmoid probabilities: {scores_sigmoid.tolist()}")

    reranker.close()
115
+
116
+
117
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="nexaml/jina-v2-rerank-mlx")
    # Bug fix: the old code checked `hasattr(args, 'simple')` but never
    # registered a --simple argument, so the attribute never existed and the
    # simple demo path (main) was unreachable. Register it as a real flag.
    parser.add_argument(
        "--simple",
        action="store_true",
        help="Run the simple demo (main) instead of the comprehensive test",
    )
    args = parser.parse_args()

    # --simple runs the minimal demo; otherwise the comprehensive test runs.
    if args.simple:
        main(args.model_path)
    else:
        test_reranking()