nexaai-1.0.29-cp310-cp310-macosx_14_0_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nexaai/__init__.py +99 -0
- nexaai/_stub.cpython-310-darwin.so +0 -0
- nexaai/_version.py +4 -0
- nexaai/asr.py +68 -0
- nexaai/asr_impl/__init__.py +0 -0
- nexaai/asr_impl/mlx_asr_impl.py +93 -0
- nexaai/asr_impl/pybind_asr_impl.py +127 -0
- nexaai/base.py +39 -0
- nexaai/binds/__init__.py +7 -0
- nexaai/binds/asr_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/common_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/cpu_gpu/libggml-base.dylib +0 -0
- nexaai/binds/cpu_gpu/libggml-cpu.so +0 -0
- nexaai/binds/cpu_gpu/libggml-metal.so +0 -0
- nexaai/binds/cpu_gpu/libggml.dylib +0 -0
- nexaai/binds/cpu_gpu/libmtmd.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_cpu_gpu.dylib +0 -0
- nexaai/binds/cpu_gpu/libnexa_plugin.dylib +0 -0
- nexaai/binds/cv_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/diarize_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/embedder_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/libnexa_bridge.dylib +0 -0
- nexaai/binds/llm_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/metal/libnexa_plugin.dylib +0 -0
- nexaai/binds/metal/py-lib/ml.py +888 -0
- nexaai/binds/metal/py-lib/mlx_audio/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/binds/metal/py-lib/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/binds/metal/py-lib/mlx_audio/server.py +525 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/binds/metal/py-lib/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/generate.py +174 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/binds/metal/py-lib/mlx_audio/stt/utils.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/audio_player.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/convert.py +71 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/generate.py +449 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/base.py +84 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/binds/metal/py-lib/mlx_audio/tts/utils.py +337 -0
- nexaai/binds/metal/py-lib/mlx_audio/utils.py +237 -0
- nexaai/binds/metal/py-lib/mlx_audio/version.py +1 -0
- nexaai/binds/metal/py-lib/profiling.py +239 -0
- nexaai/binds/nexaml/libfftw3.3.dylib +0 -0
- nexaai/binds/nexaml/libfftw3f.3.dylib +0 -0
- nexaai/binds/nexaml/libggml-base.dylib +0 -0
- nexaai/binds/nexaml/libggml-cpu.so +0 -0
- nexaai/binds/nexaml/libggml-metal.so +0 -0
- nexaai/binds/nexaml/libggml.dylib +0 -0
- nexaai/binds/nexaml/libmp3lame.0.dylib +0 -0
- nexaai/binds/nexaml/libmpg123.0.dylib +0 -0
- nexaai/binds/nexaml/libnexa-mm-process.dylib +0 -0
- nexaai/binds/nexaml/libnexa-sampling.dylib +0 -0
- nexaai/binds/nexaml/libnexa_plugin.dylib +0 -0
- nexaai/binds/nexaml/libnexaproc.dylib +0 -0
- nexaai/binds/nexaml/libomp.dylib +0 -0
- nexaai/binds/nexaml/libqwen3-vl.dylib +0 -0
- nexaai/binds/nexaml/libqwen3vl-vision.dylib +0 -0
- nexaai/binds/rerank_bind.cpython-310-darwin.so +0 -0
- nexaai/binds/vlm_bind.cpython-310-darwin.so +0 -0
- nexaai/common.py +106 -0
- nexaai/cv.py +95 -0
- nexaai/cv_impl/__init__.py +0 -0
- nexaai/cv_impl/mlx_cv_impl.py +91 -0
- nexaai/cv_impl/pybind_cv_impl.py +124 -0
- nexaai/diarize.py +80 -0
- nexaai/diarize_impl/__init__.py +1 -0
- nexaai/diarize_impl/pybind_diarize_impl.py +125 -0
- nexaai/embedder.py +73 -0
- nexaai/embedder_impl/__init__.py +0 -0
- nexaai/embedder_impl/mlx_embedder_impl.py +118 -0
- nexaai/embedder_impl/pybind_embedder_impl.py +96 -0
- nexaai/image_gen.py +141 -0
- nexaai/image_gen_impl/__init__.py +0 -0
- nexaai/image_gen_impl/mlx_image_gen_impl.py +292 -0
- nexaai/image_gen_impl/pybind_image_gen_impl.py +85 -0
- nexaai/llm.py +98 -0
- nexaai/llm_impl/__init__.py +0 -0
- nexaai/llm_impl/mlx_llm_impl.py +271 -0
- nexaai/llm_impl/pybind_llm_impl.py +238 -0
- nexaai/log.py +92 -0
- nexaai/mlx_backend/asr/__init__.py +12 -0
- nexaai/mlx_backend/asr/interface.py +122 -0
- nexaai/mlx_backend/common/__init__.py +0 -0
- nexaai/mlx_backend/common/utils.py +25 -0
- nexaai/mlx_backend/cv/__init__.py +0 -0
- nexaai/mlx_backend/cv/generate.py +195 -0
- nexaai/mlx_backend/cv/interface.py +162 -0
- nexaai/mlx_backend/cv/main.py +81 -0
- nexaai/mlx_backend/cv/modeling/pp_ocr_v4.py +1736 -0
- nexaai/mlx_backend/embedding/__init__.py +0 -0
- nexaai/mlx_backend/embedding/generate.py +333 -0
- nexaai/mlx_backend/embedding/interface.py +617 -0
- nexaai/mlx_backend/embedding/main.py +173 -0
- nexaai/mlx_backend/embedding/modeling/__init__.py +0 -0
- nexaai/mlx_backend/embedding/modeling/nexa_jina_v2.py +399 -0
- nexaai/mlx_backend/image_gen/__init__.py +1 -0
- nexaai/mlx_backend/image_gen/generate_sd.py +244 -0
- nexaai/mlx_backend/image_gen/interface.py +82 -0
- nexaai/mlx_backend/image_gen/main.py +281 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/__init__.py +306 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/clip.py +116 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/config.py +65 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/model_io.py +386 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/sampler.py +105 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/tokenizer.py +100 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/unet.py +460 -0
- nexaai/mlx_backend/image_gen/stable_diffusion/vae.py +274 -0
- nexaai/mlx_backend/llm/__init__.py +0 -0
- nexaai/mlx_backend/llm/generate.py +149 -0
- nexaai/mlx_backend/llm/interface.py +764 -0
- nexaai/mlx_backend/llm/main.py +68 -0
- nexaai/mlx_backend/ml.py +888 -0
- nexaai/mlx_backend/mlx_audio/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/__init__.py +5 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/activation.py +51 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/amp.py +96 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/bigvgan.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/conv.py +114 -0
- nexaai/mlx_backend/mlx_audio/codec/models/bigvgan/resample.py +177 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/base.py +228 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/dac.py +285 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/layers.py +129 -0
- nexaai/mlx_backend/mlx_audio/codec/models/descript/nn/quantize.py +149 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/encodec/encodec.py +777 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/mimi.py +286 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/__init__.py +20 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/conv.py +398 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/kv_cache.py +199 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/quantization.py +179 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/seanet.py +314 -0
- nexaai/mlx_backend/mlx_audio/codec/models/mimi/modules/transformer.py +256 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model.py +260 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/model_v2.py +383 -0
- nexaai/mlx_backend/mlx_audio/codec/models/s3/utils.py +122 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/attention.py +97 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/layers.py +306 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/snac.py +154 -0
- nexaai/mlx_backend/mlx_audio/codec/models/snac/vq.py +135 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/mel.py +33 -0
- nexaai/mlx_backend/mlx_audio/codec/models/vocos/vocos.py +359 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_bigvgan.py +54 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_descript.py +109 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_encodec.py +58 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_mimi.py +22 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_s3.py +25 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_snac.py +40 -0
- nexaai/mlx_backend/mlx_audio/codec/tests/test_vocos.py +93 -0
- nexaai/mlx_backend/mlx_audio/server.py +525 -0
- nexaai/mlx_backend/mlx_audio/sts/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/sts/tests/test_voice_pipeline.py +156 -0
- nexaai/mlx_backend/mlx_audio/sts/voice_pipeline.py +327 -0
- nexaai/mlx_backend/mlx_audio/stt/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/generate.py +174 -0
- nexaai/mlx_backend/mlx_audio/stt/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/alignment.py +248 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/attention.py +187 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/audio.py +76 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/conformer.py +331 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/ctc.py +34 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/parakeet.py +604 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/rnnt.py +157 -0
- nexaai/mlx_backend/mlx_audio/stt/models/parakeet/tokenizer.py +2 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/feature_extractor.py +757 -0
- nexaai/mlx_backend/mlx_audio/stt/models/wav2vec/wav2vec.py +738 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/audio.py +82 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/decoding.py +742 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/timing.py +329 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/tokenizer.py +398 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/whisper.py +862 -0
- nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py +268 -0
- nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py +381 -0
- nexaai/mlx_backend/mlx_audio/stt/utils.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/audio_player.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/convert.py +71 -0
- nexaai/mlx_backend/mlx_audio/tts/generate.py +449 -0
- nexaai/mlx_backend/mlx_audio/tts/models/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/bark.py +528 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/isftnet.py +12 -0
- nexaai/mlx_backend/mlx_audio/tts/models/bark/pipeline.py +442 -0
- nexaai/mlx_backend/mlx_audio/tts/models/base.py +84 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/audio.py +287 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/config.py +256 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/dia.py +592 -0
- nexaai/mlx_backend/mlx_audio/tts/models/dia/layers.py +870 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/attention.py +180 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/bigvgan.py +124 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/conformer.py +247 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/asp.py +59 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/ecapa_tdnn.py +91 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/se_res2net.py +132 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/ecapa_tdnn/tdnn.py +42 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/gpt2.py +38 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/indextts.py +412 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/mel.py +37 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/normalize.py +294 -0
- nexaai/mlx_backend/mlx_audio/tts/models/indextts/perceiver.py +62 -0
- nexaai/mlx_backend/mlx_audio/tts/models/interpolate.py +108 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/__init__.py +4 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/istftnet.py +979 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/kokoro.py +331 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/modules.py +659 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/pipeline.py +453 -0
- nexaai/mlx_backend/mlx_audio/tts/models/kokoro/voice.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/llama/llama.py +324 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/audio_processor.py +351 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py +162 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json +461 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/outetts.py +255 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/prompt_processor.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/models/outetts/tokens.py +36 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/__init__.py +3 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/attention.py +195 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/sesame.py +633 -0
- nexaai/mlx_backend/mlx_audio/tts/models/sesame/watermarking.py +105 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/audio_tokenizer.py +138 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/bicodec.py +269 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/blocks/sampler.py +111 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_decoder.py +120 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/feat_encoder.py +136 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/encoder_decoder/wave_generator.py +113 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/finite_scalar_quantization.py +238 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual.py +209 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/residual_fsq.py +309 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/__init__.py +1 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/ecapa_tdnn.py +283 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/perceiver_encoder.py +326 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/pooling_layers.py +297 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/modules/speaker/speaker_encoder.py +155 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/spark.py +382 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/audio.py +220 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/file.py +221 -0
- nexaai/mlx_backend/mlx_audio/tts/models/spark/utils/token_parser.py +181 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/__init__.py +0 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_base.py +66 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_convert.py +173 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_interpolate.py +88 -0
- nexaai/mlx_backend/mlx_audio/tts/tests/test_models.py +974 -0
- nexaai/mlx_backend/mlx_audio/tts/utils.py +337 -0
- nexaai/mlx_backend/mlx_audio/utils.py +237 -0
- nexaai/mlx_backend/mlx_audio/version.py +1 -0
- nexaai/mlx_backend/profiling.py +239 -0
- nexaai/mlx_backend/rerank/__init__.py +0 -0
- nexaai/mlx_backend/rerank/generate.py +174 -0
- nexaai/mlx_backend/rerank/interface.py +287 -0
- nexaai/mlx_backend/rerank/main.py +127 -0
- nexaai/mlx_backend/rerank/modeling/__init__.py +0 -0
- nexaai/mlx_backend/rerank/modeling/nexa_jina_rerank.py +330 -0
- nexaai/mlx_backend/sd/__init__.py +1 -0
- nexaai/mlx_backend/sd/interface.py +362 -0
- nexaai/mlx_backend/sd/main.py +286 -0
- nexaai/mlx_backend/sd/modeling/__init__.py +306 -0
- nexaai/mlx_backend/sd/modeling/clip.py +116 -0
- nexaai/mlx_backend/sd/modeling/config.py +65 -0
- nexaai/mlx_backend/sd/modeling/model_io.py +385 -0
- nexaai/mlx_backend/sd/modeling/sampler.py +105 -0
- nexaai/mlx_backend/sd/modeling/tokenizer.py +100 -0
- nexaai/mlx_backend/sd/modeling/unet.py +460 -0
- nexaai/mlx_backend/sd/modeling/vae.py +274 -0
- nexaai/mlx_backend/tts/__init__.py +12 -0
- nexaai/mlx_backend/tts/interface.py +276 -0
- nexaai/mlx_backend/vlm/__init__.py +3 -0
- nexaai/mlx_backend/vlm/generate.py +572 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl.py +374 -0
- nexaai/mlx_backend/vlm/generate_qwen3_vl_moe.py +259 -0
- nexaai/mlx_backend/vlm/interface.py +559 -0
- nexaai/mlx_backend/vlm/main.py +365 -0
- nexaai/mlx_backend/vlm/modeling/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/convert.py +68 -0
- nexaai/mlx_backend/vlm/modeling/models/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/aya_vision.py +193 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/interpolate.py +186 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/language.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/aya_vision/vision.py +503 -0
- nexaai/mlx_backend/vlm/modeling/models/base.py +202 -0
- nexaai/mlx_backend/vlm/modeling/models/cache.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/conversation.py +264 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/deepseek_vl_v2.py +472 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/processing_deepsek_vl_v2.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/vision.py +356 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/florence2.py +366 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/language.py +488 -0
- nexaai/mlx_backend/vlm/modeling/models/florence2/vision.py +591 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/gemma3.py +213 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/language.py +315 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3/vision.py +238 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/audio.py +1038 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/config.py +139 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/gemma3n.py +322 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/language.py +629 -0
- nexaai/mlx_backend/vlm/modeling/models/gemma3n/vision.py +1022 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/idefics2.py +294 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/language.py +191 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics2/vision.py +267 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/idefics3.py +175 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/language.py +192 -0
- nexaai/mlx_backend/vlm/modeling/models/idefics3/vision.py +233 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/internvl_chat.py +140 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/language.py +220 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/processor.py +393 -0
- nexaai/mlx_backend/vlm/modeling/models/internvl_chat/vision.py +293 -0
- nexaai/mlx_backend/vlm/modeling/models/kernels.py +307 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/kimi_vl.py +143 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/language.py +509 -0
- nexaai/mlx_backend/vlm/modeling/models/kimi_vl/vision.py +522 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/language.py +386 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/llama4.py +138 -0
- nexaai/mlx_backend/vlm/modeling/models/llama4/vision.py +560 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/language.py +240 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/llava.py +153 -0
- nexaai/mlx_backend/vlm/modeling/models/llava/vision.py +259 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/language.py +236 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/llava_bunny.py +256 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_bunny/vision.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/llava_next.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/llava_next/vision.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mistral3/mistral3.py +283 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/language.py +416 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/mllama.py +172 -0
- nexaai/mlx_backend/vlm/modeling/models/mllama/vision.py +499 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/language.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/molmo.py +133 -0
- nexaai/mlx_backend/vlm/modeling/models/molmo/vision.py +465 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/__init__.py +10 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/language.py +230 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/multi_modality.py +385 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/sam.py +557 -0
- nexaai/mlx_backend/vlm/modeling/models/multi_modality/vision.py +526 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/language.py +282 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/paligemma.py +160 -0
- nexaai/mlx_backend/vlm/modeling/models/paligemma/vision.py +242 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/language.py +21 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/phi3_v.py +243 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/su_rope.py +71 -0
- nexaai/mlx_backend/vlm/modeling/models/phi3_v/vision.py +324 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/language.py +229 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/pixtral.py +161 -0
- nexaai/mlx_backend/vlm/modeling/models/pixtral/vision.py +320 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/config.py +108 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/qwen2_5_vl.py +168 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_5_vl/vision.py +414 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/__init__.py +2 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/config.py +104 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/language.py +490 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/qwen2_vl.py +167 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen2_vl/vision.py +312 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3_vl/qwen3vl.py +1262 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/__init__.py +0 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/base.py +117 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/cache.py +531 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/generate.py +701 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/rope_utils.py +255 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/sample_utils.py +303 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/llm_common/tokenizer_utils.py +407 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/processor.py +476 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/qwen3vl_moe.py +1308 -0
- nexaai/mlx_backend/vlm/modeling/models/qwen3vl_moe/switch_layers.py +210 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/__init__.py +8 -0
- nexaai/mlx_backend/vlm/modeling/models/smolvlm/smolvlm.py +62 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_5_vl.py +209 -0
- nexaai/mlx_backend/vlm/modeling/processing_qwen2_vl.py +215 -0
- nexaai/mlx_backend/vlm/modeling/prompt_utils.py +474 -0
- nexaai/mlx_backend/vlm/modeling/sample_utils.py +39 -0
- nexaai/mlx_backend/vlm/modeling/tokenizer_utils.py +344 -0
- nexaai/mlx_backend/vlm/modeling/trainer/__init__.py +9 -0
- nexaai/mlx_backend/vlm/modeling/trainer/lora.py +70 -0
- nexaai/mlx_backend/vlm/modeling/trainer/trainer.py +296 -0
- nexaai/mlx_backend/vlm/modeling/trainer/utils.py +160 -0
- nexaai/mlx_backend/vlm/modeling/utils.py +928 -0
- nexaai/rerank.py +57 -0
- nexaai/rerank_impl/__init__.py +0 -0
- nexaai/rerank_impl/mlx_rerank_impl.py +94 -0
- nexaai/rerank_impl/pybind_rerank_impl.py +136 -0
- nexaai/runtime.py +68 -0
- nexaai/runtime_error.py +24 -0
- nexaai/tts.py +75 -0
- nexaai/tts_impl/__init__.py +0 -0
- nexaai/tts_impl/mlx_tts_impl.py +94 -0
- nexaai/tts_impl/pybind_tts_impl.py +43 -0
- nexaai/utils/decode.py +18 -0
- nexaai/utils/manifest_utils.py +531 -0
- nexaai/utils/model_manager.py +1745 -0
- nexaai/utils/model_types.py +49 -0
- nexaai/utils/progress_tracker.py +389 -0
- nexaai/utils/quantization_utils.py +245 -0
- nexaai/vlm.py +130 -0
- nexaai/vlm_impl/__init__.py +0 -0
- nexaai/vlm_impl/mlx_vlm_impl.py +259 -0
- nexaai/vlm_impl/pybind_vlm_impl.py +275 -0
- nexaai-1.0.29.dist-info/METADATA +35 -0
- nexaai-1.0.29.dist-info/RECORD +580 -0
- nexaai-1.0.29.dist-info/WHEEL +5 -0
- nexaai-1.0.29.dist-info/top_level.txt +1 -0
@@ -0,0 +1,239 @@
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass, field
+from typing import Any, Optional
+from enum import IntEnum
+
+# --------------------------------------------------------------------------------------
+# Stop reason constants matching profile.h
+# --------------------------------------------------------------------------------------
+
+class StopReason(IntEnum):
+    """Stop reason constants matching profile.h"""
+    ML_STOP_REASON_UNKNOWN = 0
+    ML_STOP_REASON_EOS = 1
+    ML_STOP_REASON_LENGTH = 2
+    ML_STOP_REASON_USER = 3
+    ML_STOP_REASON_STOP_SEQUENCE = 4
+    ML_STOP_REASON_COMPLETED = 5
+
+# --------------------------------------------------------------------------------------
+# Profiling data structure
+# --------------------------------------------------------------------------------------
+
+@dataclass
+class ProfilingData:
+    """Profiling data for performance metrics."""
+    ttft_us: int = 0  # Time to first token (us)
+    total_time_us: int = 0  # Total generation time (us)
+    prompt_time_us: int = 0  # Prompt processing time (us)
+    decode_time_us: int = 0  # Token generation time (us)
+    tokens_per_second: float = 0.0  # Decoding speed (tokens/sec)
+    total_tokens: int = 0  # Total tokens generated
+    prompt_tokens: int = 0  # Number of prompt tokens
+    generated_tokens: int = 0  # Number of generated tokens
+    stop_reason: int = StopReason.ML_STOP_REASON_UNKNOWN  # Stop reason (numeric)
+
+    def reset(self):
+        """Reset all profiling data."""
+        self.ttft_us = 0
+        self.total_time_us = 0
+        self.prompt_time_us = 0
+        self.decode_time_us = 0
+        self.tokens_per_second = 0.0
+        self.total_tokens = 0
+        self.prompt_tokens = 0
+        self.generated_tokens = 0
+        self.stop_reason = StopReason.ML_STOP_REASON_UNKNOWN
+
+# --------------------------------------------------------------------------------------
+# Profiling context (similar to ml_ProfilingContext in profile.h)
+# --------------------------------------------------------------------------------------
+
+@dataclass
+class ProfilingContext:
+    """Profiling context for tracking timing and state."""
+    start_time: Optional[float] = None
+    prompt_start_time: Optional[float] = None
+    prompt_end_time: Optional[float] = None
+    decode_start_time: Optional[float] = None
+    decode_end_time: Optional[float] = None
+    first_token_time: Optional[float] = None
+    end_time: Optional[float] = None
+
+    ttft_recorded: bool = False
+    stop_reason: int = StopReason.ML_STOP_REASON_UNKNOWN
+    prompt_tokens: int = 0
+    generated_tokens: int = 0
+
+    def reset(self):
+        """Reset profiling context."""
+        self.start_time = None
+        self.prompt_start_time = None
+        self.prompt_end_time = None
+        self.decode_start_time = None
+        self.decode_end_time = None
+        self.first_token_time = None
+        self.end_time = None
+        self.ttft_recorded = False
+        self.stop_reason = StopReason.ML_STOP_REASON_UNKNOWN
+        self.prompt_tokens = 0
+        self.generated_tokens = 0
+
+# --------------------------------------------------------------------------------------
+# Profiling functions (similar to profile.h functions)
+# --------------------------------------------------------------------------------------
+
+def profiling_reset(ctx: ProfilingContext) -> None:
+    """Reset profiling context (ml_profiling_reset)."""
+    ctx.reset()
+
+def profiling_start(ctx: ProfilingContext) -> None:
+    """Start profiling (ml_profiling_start)."""
+    ctx.start_time = time.perf_counter()
+    ctx.prompt_start_time = ctx.start_time
+
+def profiling_prompt_start(ctx: ProfilingContext) -> None:
+    """Start prompt processing timing (ml_profiling_prompt_start)."""
+    ctx.prompt_start_time = time.perf_counter()
+
+def profiling_prompt_end(ctx: ProfilingContext) -> None:
+    """End prompt processing timing (ml_profiling_prompt_end)."""
+    ctx.prompt_end_time = time.perf_counter()
+
+def profiling_decode_start(ctx: ProfilingContext) -> None:
+    """Start decode timing (ml_profiling_decode_start)."""
+    ctx.decode_start_time = time.perf_counter()
+
+def profiling_decode_end(ctx: ProfilingContext) -> None:
+    """End decode timing (ml_profiling_decode_end)."""
+    ctx.decode_end_time = time.perf_counter()
+
+def profiling_record_ttft(ctx: ProfilingContext) -> None:
+    """Record time to first token (ml_profiling_record_ttft)."""
+    if not ctx.ttft_recorded and ctx.start_time is not None:
+        ctx.first_token_time = time.perf_counter()
+        ctx.ttft_recorded = True
+
+def profiling_update_prompt_tokens(ctx: ProfilingContext, prompt_tokens: int) -> None:
+    """Update prompt token count (ml_profiling_update_prompt_tokens)."""
+    ctx.prompt_tokens = prompt_tokens
+
+def profiling_update_generated_tokens(ctx: ProfilingContext, generated_tokens: int) -> None:
+    """Update generated token count (ml_profiling_update_generated_tokens)."""
+    ctx.generated_tokens = generated_tokens
+
+def profiling_stop_reason(ctx: ProfilingContext, stop_reason: int) -> None:
+    """Set stop reason (ml_profiling_stop_reason)."""
+    ctx.stop_reason = stop_reason
+
+def profiling_end(ctx: ProfilingContext) -> None:
+    """End profiling (ml_profiling_end)."""
+    ctx.end_time = time.perf_counter()
+
+def profiling_gen_data(ctx: ProfilingContext) -> ProfilingData:
+    """Generate profiling data from context (ml_profiling_gen_data)."""
+    data = ProfilingData()
+
+    if ctx.start_time is None or ctx.end_time is None:
+        return data
+
+    # Calculate total time
+    data.total_time_us = int((ctx.end_time - ctx.start_time) * 1_000_000)
+
+    # Calculate prompt time
+    if ctx.prompt_start_time is not None and ctx.prompt_end_time is not None:
+        data.prompt_time_us = int((ctx.prompt_end_time - ctx.prompt_start_time) * 1_000_000)
+
+    # Calculate decode time
+    if ctx.decode_start_time is not None and ctx.decode_end_time is not None:
+        data.decode_time_us = int((ctx.decode_end_time - ctx.decode_start_time) * 1_000_000)
+
+    # Calculate TTFT
+    if ctx.first_token_time is not None and ctx.start_time is not None:
+        data.ttft_us = int((ctx.first_token_time - ctx.start_time) * 1_000_000)
+
+    # Set token counts
+    data.prompt_tokens = ctx.prompt_tokens
+    data.generated_tokens = ctx.generated_tokens
+    data.total_tokens = ctx.prompt_tokens + ctx.generated_tokens
+
+    # Calculate tokens per second
+    if data.decode_time_us > 0:
+        data.tokens_per_second = (data.generated_tokens * 1_000_000.0) / data.decode_time_us
+
+    # Set stop reason
+    data.stop_reason = ctx.stop_reason
+
+    return data
+
+def stop_reason_to_string(reason: int) -> str:
+    """Convert stop reason to string (stop_reason_to_string)."""
+    try:
+        return StopReason(reason).name
+    except ValueError:
+        return f"UNKNOWN({reason})"
+
+# --------------------------------------------------------------------------------------
+# Profiling mixin for model classes
+# --------------------------------------------------------------------------------------
+
+class ProfilingMixin:
+    """Mixin class to add profiling capabilities to model classes."""
+
+    def __init__(self):
+        """Initialize profiling mixin."""
+        self._profiling_context = ProfilingContext()
+        self._profiling_data = ProfilingData()
+
+    def _start_profiling(self) -> None:
+        """Start profiling for an operation."""
+        profiling_reset(self._profiling_context)
+        profiling_start(self._profiling_context)
+
+    def _prompt_start(self) -> None:
+        """Start prompt processing timing."""
+        profiling_prompt_start(self._profiling_context)
+
+    def _prompt_end(self) -> None:
+        """End prompt processing timing."""
+        profiling_prompt_end(self._profiling_context)
+
+    def _decode_start(self) -> None:
+        """Start decode timing."""
+        profiling_decode_start(self._profiling_context)
+
+    def _decode_end(self) -> None:
+        """End decode timing."""
+        profiling_decode_end(self._profiling_context)
+
+    def _record_ttft(self) -> None:
+        """Record time to first token."""
+        profiling_record_ttft(self._profiling_context)
+
+    def _update_prompt_tokens(self, prompt_tokens: int) -> None:
+        """Update prompt token count."""
+        profiling_update_prompt_tokens(self._profiling_context, prompt_tokens)
+
+    def _update_generated_tokens(self, generated_tokens: int) -> None:
+        """Update generated token count."""
+        profiling_update_generated_tokens(self._profiling_context, generated_tokens)
+
+    def _set_stop_reason(self, stop_reason: int) -> None:
+        """Set stop reason."""
+        profiling_stop_reason(self._profiling_context, stop_reason)
+
+    def _end_profiling(self) -> ProfilingData:
+        """End profiling and return data."""
+        profiling_end(self._profiling_context)
+        self._profiling_data = profiling_gen_data(self._profiling_context)
+        return self._profiling_data
+
+    def get_profiling_data(self) -> ProfilingData:
+        """Get profiling data for the last operation."""
+        return self._profiling_data
+
+    def reset_profiling(self) -> None:
+        """Reset profiling data."""
+        self._profiling_data.reset()
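For orientation, the module above is pure bookkeeping around time.perf_counter(). The sketch below shows the call order its helpers appear designed for, assuming this hunk corresponds to nexaai/mlx_backend/profiling.py from the listing above (the same module also ships under nexaai/binds/metal/py-lib/). The sleeps and token counts are illustrative stand-ins, not package behavior.

import time
from nexaai.mlx_backend.profiling import (
    ProfilingContext, StopReason, profiling_reset, profiling_start,
    profiling_prompt_start, profiling_prompt_end, profiling_decode_start,
    profiling_decode_end, profiling_record_ttft, profiling_update_prompt_tokens,
    profiling_update_generated_tokens, profiling_stop_reason, profiling_end,
    profiling_gen_data, stop_reason_to_string,
)

ctx = ProfilingContext()
profiling_reset(ctx)
profiling_start(ctx)                       # also seeds prompt_start_time

profiling_prompt_start(ctx)
time.sleep(0.01)                           # stand-in for prompt processing
profiling_prompt_end(ctx)
profiling_update_prompt_tokens(ctx, 12)    # illustrative count

profiling_decode_start(ctx)
for step in range(5):                      # stand-in for a token generation loop
    time.sleep(0.002)
    if step == 0:
        profiling_record_ttft(ctx)         # only the first call is recorded
profiling_decode_end(ctx)
profiling_update_generated_tokens(ctx, 5)

profiling_stop_reason(ctx, StopReason.ML_STOP_REASON_EOS)
profiling_end(ctx)

data = profiling_gen_data(ctx)             # derives ttft_us, tokens/sec, totals
print(data.ttft_us, data.decode_time_us, data.tokens_per_second,
      stop_reason_to_string(data.stop_reason))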
File without changes
@@ -0,0 +1,174 @@
+# Copyright © Nexa AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+import time
+
+from transformers import AutoTokenizer
+from huggingface_hub import snapshot_download
+from .modeling.nexa_jina_rerank import Model, ModelArgs
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """Create position ids from input ids, accounting for padding tokens"""
+    mask = (input_ids != padding_idx).astype(mx.int32)
+    incremental_indices = (mx.cumsum(mask, axis=1) + past_key_values_length) * mask
+    return incremental_indices.astype(mx.int32) + padding_idx
+
+
+def prepare_inputs(query, documents, tokenizer, max_length=1024):
+    """Prepare inputs for the model - match torch exactly"""
+    sentence_pairs = [[query, doc] for doc in documents]
+    inputs = tokenizer(
+        sentence_pairs,
+        padding="max_length",
+        truncation=True,
+        return_tensors="np",
+        max_length=max_length,
+    )
+
+    input_ids = mx.array(inputs["input_ids"]).astype(mx.int32)
+    seqlen = input_ids.shape[1]
+    attention_mask = mx.array(inputs["attention_mask"]).astype(mx.float32)
+
+    # Create token_type_ids as 1D tensor like torch, then broadcast for each batch item
+    token_type_ids_1d = mx.zeros(seqlen, dtype=mx.int32)
+    batch_size = input_ids.shape[0]
+    token_type_ids = mx.broadcast_to(
+        mx.expand_dims(token_type_ids_1d, axis=0), (batch_size, seqlen)
+    )
+
+    # Create position ids for each sequence in the batch
+    position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)
+
+    return input_ids, attention_mask, token_type_ids, position_ids
+
+
+def load_model(model_id):
+    """Initialize and load the Jina V2 rerank model."""
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    model_dir = f"{curr_dir}/modelfiles/nexaml_jina_v2_rerank_mlx"
+
+    # Download model if not exists
+    if not os.path.exists(model_dir):
+        print(f"Downloading model {model_id}...")
+
+        os.makedirs(model_dir, exist_ok=True)
+
+        try:
+            snapshot_download(
+                repo_id=model_id,
+                allow_patterns=["*.safetensors", "config.json", "tokenizer*"],
+                local_dir=model_dir,
+                local_dir_use_symlinks=False
+            )
+            print("Model download completed!")
+        except Exception as e:
+            print(f"Failed to download model: {e}")
+            print("Try: huggingface-cli login (if authentication required)")
+            raise
+
+    # Create model config
+    config = ModelArgs()
+    model = Model(config)
+
+    # Load weights
+    weight_file = os.path.join(model_dir, "model.safetensors")
+    if not os.path.exists(weight_file):
+        # Try alternative naming patterns
+        safetensors_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
+        if safetensors_files:
+            weight_file = os.path.join(model_dir, safetensors_files[0])
+        else:
+            raise FileNotFoundError(f"No .safetensors file found in {model_dir}")
+
+    print(f"Loading weights from: {weight_file}")
+    model.load_weights(weight_file, strict=True)
+    model.eval()
+
+    return model, model_dir
+
+
+def load_tokenizer(model_path):
+    """Load and configure the tokenizer."""
+    return AutoTokenizer.from_pretrained(model_path)
+
+
+def rerank_documents(model, tokenizer, query, documents, max_length=1024):
+    """Rerank documents based on query relevance."""
+    # Prepare inputs
+    input_ids, attention_mask, token_type_ids, position_ids = prepare_inputs(
+        query, documents, tokenizer, max_length
+    )
+
+    # Run inference
+    start_time = time.time()
+    scores = model.nexa_forward(input_ids, attention_mask, token_type_ids, position_ids)
+    scores = mx.squeeze(scores, axis=-1)
+    end_time = time.time()
+
+    # Apply sigmoid to get probabilities
+    scores_sigmoid = mx.sigmoid(scores)
+
+    inference_time = (end_time - start_time) * 1000  # Convert to ms
+
+    return scores, scores_sigmoid, inference_time
+
+
+def main(model_id):
+    """Main function to handle reranking demonstration."""
+
+    # Load model and tokenizer
+    model, model_path = load_model(model_id)
+    tokenizer = load_tokenizer(model_path)
+
+    # Example query and documents
+    query = "What are the health benefits of green tea?"
+    documents = [
+        "Green tea is rich in antioxidants and may improve brain function.",
+        "Coffee contains caffeine and can boost energy levels.",
+        "Das Trinken von grünem Tee kann das Risiko für Herzkrankheiten senken.",
+        "Black tea is another popular beverage with its own health benefits.",
+    ]
+
+    # Perform reranking
+    scores, scores_sigmoid, inference_time = rerank_documents(
+        model, tokenizer, query, documents
+    )
+
+    # Display results
+    print("=" * 70)
+    print("Reranking Results:")
+    print("=" * 70)
+    print(f"Query: {query}")
+    print()
+
+    for i, (doc, score, prob) in enumerate(zip(documents, scores.tolist(), scores_sigmoid.tolist())):
+        print(f"Document {i+1}:")
+        print(f"  Text: {doc}")
+        print(f"  Score: {score:.4f}")
+        print(f"  Probability: {prob:.4f}")
+        print()
+
+    print(f"Inference time: {inference_time:.1f}ms")
+    print(f"Throughput: {len(documents)/inference_time*1000:.1f} docs/s")
+
+
+if __name__ == "__main__":
+    model_id = "nexaml/jina-v2-rerank-mlx"
+    main(model_id)
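The one non-obvious step in the file above is create_position_ids_from_input_ids, which produces padding-aware position ids in the style used by RoBERTa-family models: real tokens count up from padding_idx + 1 and padded slots stay at padding_idx. A small self-contained check of that arithmetic with made-up token ids (it mirrors the function body with past_key_values_length=0):

import mlx.core as mx

# Two sequences padded with token id 1 (padding_idx=1), as in prepare_inputs above.
input_ids = mx.array([[0, 5, 7, 1, 1],
                      [0, 9, 1, 1, 1]], dtype=mx.int32)
padding_idx = 1

# Same arithmetic as create_position_ids_from_input_ids with past_key_values_length=0.
mask = (input_ids != padding_idx).astype(mx.int32)     # [[1,1,1,0,0], [1,1,0,0,0]]
incremental_indices = mx.cumsum(mask, axis=1) * mask   # [[1,2,3,0,0], [1,2,0,0,0]]
position_ids = incremental_indices.astype(mx.int32) + padding_idx

print(position_ids)  # [[2,3,4,1,1], [2,3,1,1,1]]: real tokens count up, pad slots stay at padding_idx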
@@ -0,0 +1,287 @@
+# Copyright © Nexa AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import os
+import json
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+import time
+from pathlib import Path
+from typing import Any, List, Optional, Sequence
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+
+# Import necessary modules
+from transformers import AutoTokenizer
+
+# Import from ml.py for API alignment (assuming similar structure)
+try:
+    from ml import (
+        Reranker as BaseReranker,
+        Path as PathType,
+    )
+except ImportError:
+    # Fallback to local definitions if ml.py not available
+    PathType = Path
+    BaseReranker = ABC
+
+# Import profiling module
+from profiling import ProfilingMixin, ProfilingData, StopReason
+
+# Import the model implementation
+from .modeling.nexa_jina_rerank import Model, ModelArgs
+
+
+@dataclass
+class RerankConfig:
+    """Configuration for reranking."""
+    batch_size: int = 1
+    normalize: bool = True
+    normalize_method: str = "softmax"  # "softmax" | "min-max" | "none"
+
+    def __init__(
+        self,
+        batch_size: int = 1,
+        normalize: bool = True,
+        normalize_method: str = "softmax",
+    ) -> None:
+        self.batch_size = batch_size
+        self.normalize = normalize
+        self.normalize_method = normalize_method
+
+
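Note that RerankConfig pairs @dataclass field defaults with a hand-written __init__ that sets the same three fields, so either construction path yields identical objects. A short construction sketch (field names come from the class above; the variable names are illustrative):

    default_cfg = RerankConfig()                                    # batch_size=1, softmax normalization
    minmax_cfg = RerankConfig(batch_size=8, normalize_method="min-max")
    raw_cfg = RerankConfig(batch_size=4, normalize=False)           # keep raw model scores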
+class Reranker(BaseReranker, ProfilingMixin):
+    """
+    Reranker interface for MLX reranking models.
+    API aligned with ml.py Reranker abstract base class.
+    """
+
+    def __init__(
+        self,
+        model_path: PathType,
+        tokenizer_path: PathType,
+        device: Optional[str] = None,
+    ) -> None:
+        """Initialize the Reranker model."""
+        # Initialize profiling mixin
+        ProfilingMixin.__init__(self)
+
+        # Store paths
+        if os.path.isfile(model_path):
+            model_path = os.path.dirname(model_path)
+
+        # Call parent constructor if inheriting from ml.py
+        if hasattr(super(), '__init__'):
+            super().__init__(model_path, tokenizer_path, device)
+
+        # Store paths and device
+        self.model_path = model_path
+        self.tokenizer_path = tokenizer_path
+        self.device = device if device is not None else "cpu"
+
+        # Initialize model and tokenizer as None
+        self.model = None
+        self.tokenizer = None
+        self.config = None
+
+    def destroy(self) -> None:
+        """Destroy the model and free resources."""
+        self.model = None
+        self.tokenizer = None
+        self.config = None
+
+    def load_model(self, model_path: PathType, extra_data: Any = None) -> bool:
+        """Load model from path."""
+        try:
+            # Use the provided model_path or fall back to instance path
+            if model_path:
+                # Apply same file-to-directory conversion as in __init__
+                if os.path.isfile(model_path):
+                    model_path = os.path.dirname(model_path)
+                self.model_path = model_path
+
+            # Load the model using internal implementation
+            self.model = self._load_jina_model(self.model_path)
+            self.tokenizer = self._load_tokenizer()
+
+            return True
+        except Exception as e:
+            print(f"Failed to load model: {e}")
+            return False
+
+    def close(self) -> None:
+        """Close the model."""
+        self.destroy()
+
+    def rerank(
+        self,
+        query: str,
+        documents: Sequence[str],
+        config: Optional[RerankConfig] = None,
+        clear_cache: bool = True,
+    ) -> mx.array:
+        """Rerank documents given a query."""
+        if self.model is None or self.tokenizer is None:
+            raise RuntimeError("Model not loaded. Call load_model() first.")
+
+        if config is None:
+            config = RerankConfig()
+
+        # Start profiling
+        self._start_profiling()
+        self._prompt_start()
+
+        all_scores = []
+
+        # Process documents in batches
+        batch_size = config.batch_size
+        for i in range(0, len(documents), batch_size):
+            batch_docs = documents[i:i + batch_size]
+            batch_scores = self._rerank_batch(query, batch_docs, config)
+            all_scores.append(batch_scores)
+
+            if clear_cache:
+                mx.clear_cache()
+
+        # End prompt processing, start decode
+        self._prompt_end()
+        self._decode_start()
+
+        # Concatenate all batch scores into a single array
+        res = mx.concatenate(all_scores, axis=0) if len(all_scores) > 1 else all_scores[0]
+
+        # End decode and profiling
+        self._decode_end()
+        self._set_stop_reason(StopReason.ML_STOP_REASON_COMPLETED)
+        self._end_profiling()
+
+        return res
+
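One consequence of the batching loop in rerank() above: normalization happens inside _rerank_batch, per batch, before the batches are concatenated. With the defaults (normalize=True, normalize_method="softmax", batch_size=1) each single-document batch therefore softmaxes to a probability of 1.0. A sketch of one way to normalize over the full candidate set instead, assuming an already-loaded Reranker instance (the variable names are illustrative):

    # Defer normalization until all batches have been scored.
    raw_cfg = RerankConfig(batch_size=4, normalize=False)
    raw_scores = reranker.rerank(query, documents, config=raw_cfg)  # shape: (len(documents),)
    probs = mx.softmax(raw_scores, axis=0)                          # normalize across all documents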
+    def _load_jina_model(self, model_dir: str) -> Model:
+        """Initialize and load the Jina V2 rerank model."""
+
+        # Validate that model path exists
+        if not os.path.exists(model_dir):
+            raise ValueError(f"Model path does not exist: {model_dir}")
+
+        # Store model directory for tokenizer loading
+        self._model_dir = model_dir
+
+        # Create model config
+        config = ModelArgs()
+        model = Model(config)
+
+        # Load weights
+        weight_file = os.path.join(model_dir, "model.safetensors")
+        if not os.path.exists(weight_file):
+            # Try alternative naming patterns
+            safetensors_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
+            if safetensors_files:
+                weight_file = os.path.join(model_dir, safetensors_files[0])
+            else:
+                raise FileNotFoundError(f"No .safetensors file found in {model_dir}")
+
+        model.load_weights(weight_file, strict=True)
+        model.eval()
+
+        return model
+
+    def _load_tokenizer(self) -> AutoTokenizer:
+        """Load and configure the tokenizer."""
+        return AutoTokenizer.from_pretrained(self._model_dir)
+
+    def _rerank_batch(self, query: str, documents: List[str], config: RerankConfig) -> mx.array:
+        """Rerank a batch of documents and return their scores."""
+        # Prepare inputs
+        input_ids, attention_mask, token_type_ids, position_ids = self._prepare_inputs(
+            query, documents, self.tokenizer, max_length=1024
+        )
+
+        # Run inference
+        scores = self.model.nexa_forward(input_ids, attention_mask, token_type_ids, position_ids)
+        scores = mx.squeeze(scores, axis=-1)
+
+        # Apply normalization if requested
+        if config.normalize:
+            scores = self._normalize_scores(scores, config.normalize_method)
+
+        return scores
+
+    def _create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0):
+        """Create position ids from input ids, accounting for padding tokens"""
+        mask = (input_ids != padding_idx).astype(mx.int32)
+        incremental_indices = (mx.cumsum(mask, axis=1) + past_key_values_length) * mask
+        return incremental_indices.astype(mx.int32) + padding_idx
+
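The position-id helper above follows the RoBERTa/XLM-R convention: padding positions keep padding_idx and real tokens are numbered from padding_idx + 1. A small worked example (token ids chosen for illustration only):

    # padding_idx = 1, one padded sequence:
    #   input_ids             = [0, 523, 87, 2, 1, 1]
    #   mask                  = [1,   1,  1, 1, 0, 0]   # input_ids != padding_idx
    #   cumsum(mask) * mask   = [1,   2,  3, 4, 0, 0]
    #   + padding_idx         = [2,   3,  4, 5, 1, 1]   # pads stay at 1, real tokens count from 2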
+    def _prepare_inputs(self, query, documents, tokenizer, max_length=1024):
+        """Prepare inputs for the model; matches the torch implementation exactly."""
+        sentence_pairs = [[query, doc] for doc in documents]
+        inputs = tokenizer(
+            sentence_pairs,
+            padding="max_length",
+            truncation=True,
+            return_tensors="np",
+            max_length=max_length,
+        )
+
+        input_ids = mx.array(inputs["input_ids"]).astype(mx.int32)
+        seqlen = input_ids.shape[1]
+        attention_mask = mx.array(inputs["attention_mask"]).astype(mx.float32)
+
+        # Create token_type_ids as 1D tensor like torch, then broadcast for each batch item
+        token_type_ids_1d = mx.zeros(seqlen, dtype=mx.int32)
+        batch_size = input_ids.shape[0]
+        token_type_ids = mx.broadcast_to(
+            mx.expand_dims(token_type_ids_1d, axis=0), (batch_size, seqlen)
+        )
+
+        # Create position ids for each sequence in the batch
+        position_ids = self._create_position_ids_from_input_ids(input_ids, padding_idx=1)
+
+        return input_ids, attention_mask, token_type_ids, position_ids
+
+    def _normalize_scores(self, scores: mx.array, method: str) -> mx.array:
+        """Normalize scores using specified method."""
+        if method == "none":
+            return scores
+        elif method == "softmax":
+            # For 1D arrays, use axis=0; for higher dims, use axis=-1
+            if len(scores.shape) == 1:
+                return mx.softmax(scores, axis=0)
+            else:
+                return mx.softmax(scores, axis=-1)
+        elif method == "min-max":
+            min_val = mx.min(scores)
+            max_val = mx.max(scores)
+            if max_val > min_val:
+                return (scores - min_val) / (max_val - min_val)
+            return scores
+        else:
+            return scores
+
+
+# Factory function for creating reranker instances
+def create_reranker(
+    model_path: PathType,
+    tokenizer_path: Optional[PathType] = None,
+    device: Optional[str] = None,
+) -> Reranker:
+    """Create and return a Reranker instance."""
+    if tokenizer_path is None:
+        tokenizer_path = model_path
+
+    return Reranker(model_path, tokenizer_path, device)
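Taken together, the factory and the Reranker interface give the following end-to-end flow. A minimal usage sketch, assuming a converted MLX checkpoint already exists at the placeholder path (path, query, and documents below are illustrative):

    reranker = create_reranker("/path/to/jina-v2-rerank-mlx")  # tokenizer_path defaults to model_path
    if not reranker.load_model(reranker.model_path):
        raise SystemExit("Failed to load reranker model")

    scores = reranker.rerank(
        "What are the health benefits of green tea?",
        ["Green tea is rich in antioxidants.", "Coffee boosts energy levels."],
        config=RerankConfig(batch_size=2, normalize=True, normalize_method="softmax"),
    )
    print(scores.tolist())

    reranker.close()  # releases model and tokenizer references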