PyPI - intextus-embed - Versions diffs - 0.1.0__py3-none-any.whl - Mend

intextus-embed 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

intextus/__init__.py +4 -0
intextus/encoder.py +205 -0
intextus/export.py +107 -0
intextus/utils.py +77 -0
intextus_embed-0.1.0.dist-info/METADATA +94 -0
intextus_embed-0.1.0.dist-info/RECORD +10 -0
intextus_embed-0.1.0.dist-info/WHEEL +5 -0
intextus_embed-0.1.0.dist-info/entry_points.txt +2 -0
intextus_embed-0.1.0.dist-info/licenses/LICENSE +21 -0
intextus_embed-0.1.0.dist-info/top_level.txt +1 -0

intextus/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .encoder import IntextusEncoder
+from .utils import compute_maxsim
+__all__ = ["IntextusEncoder", "compute_maxsim"]

intextus/encoder.py ADDED Viewed

@@ -0,0 +1,205 @@
+import os
+from typing import List, Union, Dict, Any
+import numpy as np
+import onnxruntime as ort
+from tokenizers import Tokenizer
+from intextus.utils import get_punctuation_token_ids
+class IntextusEncoder:
+    def __init__(
+        self,
+        model_name_or_path: str = "intextus/mxbai-edge-colbert-v0-17m-onnx",
+        tokenizer_path: str = None,
+        query_marker: str = "[Q]",
+        doc_marker: str = "[D]",
+        do_lower_case: bool = True,
+        provider: str = "CPUExecutionProvider"
+    ):
+        """
+        Pure ONNX engine for generic ColBERT execution.
+        Args:
+            model_name_or_path: Local path to a directory, an ONNX file, or a Hugging Face Hub model ID/alias.
+            tokenizer_path: Optional path to tokenizer.json. If None, it is resolved automatically.
+            query_marker: Special marker string used to denote query sequence.
+            doc_marker: Special marker string used to denote document sequence.
+            provider: Execution provider for ONNX Runtime inference.
+        """
+        # Resolve paths dynamically
+        model_path = None
+        if os.path.exists(model_name_or_path):
+            if os.path.isdir(model_name_or_path):
+                model_path = os.path.join(model_name_or_path, "model.onnx")
+                if tokenizer_path is None:
+                    tokenizer_path = os.path.join(model_name_or_path, "tokenizer.json")
+            else:
+                model_path = model_name_or_path
+                if tokenizer_path is None:
+                    dir_name = os.path.dirname(model_name_or_path)
+                    tokenizer_path = os.path.join(dir_name, "tokenizer.json")
+        else:
+            repo_id = model_name_or_path
+            supported_mappings = {
+                "mxbai-edge-colbert-v0-17m": "intextus/mxbai-edge-colbert-v0-17m-onnx",
+                "mxbai-edge-colbert-v0-32m": "intextus/mxbai-edge-colbert-v0-32m-onnx",
+                "lateon": "intextus/lateon-onnx"
+            }
+            if repo_id in supported_mappings:
+                repo_id = supported_mappings[repo_id]
+            try:
+                from huggingface_hub import hf_hub_download
+                print(f"Downloading model file from Hugging Face repository '{repo_id}'...")
+                model_path = hf_hub_download(repo_id=repo_id, filename="model.onnx")
+                if tokenizer_path is None:
+                    tokenizer_path = hf_hub_download(repo_id=repo_id, filename="tokenizer.json")
+            except Exception as e:
+                raise ValueError(
+                    f"Could not load model '{model_name_or_path}' from local path or Hugging Face Hub.\n"
+                    f"Underlying error: {e}"
+                )
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(f"ONNX model file not found at {model_path}")
+        if tokenizer_path is None or not os.path.exists(tokenizer_path):
+            raise FileNotFoundError(f"Tokenizer file not found at {tokenizer_path}")
+        # Initialize the ultra-fast Rust tokenizer
+        self.tokenizer = Tokenizer.from_file(tokenizer_path)
+        # Initialize execution session
+        self.session = ort.InferenceSession(model_path, providers=[provider])
+        self.do_lower_case = do_lower_case
+        # Dynamically discover graph inputs/outputs to remain generic
+        self.input_names = [i.name for i in self.session.get_inputs()]
+        self.output_name = self.session.get_outputs()[0].name
+        # Fetch token IDs for ColBERT context injection (handling trailing space variants)
+        self.query_marker_id = self.tokenizer.token_to_id(query_marker)
+        if self.query_marker_id is None:
+            # Fallback for models (like PyLate/mxbai) where special tokens have trailing spaces
+            self.query_marker_id = self.tokenizer.token_to_id(query_marker + " ")
+            if self.query_marker_id is not None:
+                query_marker = query_marker + " "
+        self.doc_marker_id = self.tokenizer.token_to_id(doc_marker)
+        if self.doc_marker_id is None:
+            self.doc_marker_id = self.tokenizer.token_to_id(doc_marker + " ")
+            if self.doc_marker_id is not None:
+                doc_marker = doc_marker + " "
+        if self.query_marker_id is None or self.doc_marker_id is None:
+            print(f"[Warning] Custom markers '{query_marker.strip()}'/'{doc_marker.strip()}' not found in vocabulary. Defaulting to standard tokenization.")
+        # Dynamically find all token IDs associated with string punctuation symbols
+        # to construct the punctuation masking skiplist.
+        skiplist_set = get_punctuation_token_ids(
+            vocab=self.tokenizer.get_vocab(),
+            query_marker=query_marker,
+            doc_marker=doc_marker
+        )
+        # Pre-compile the skiplist to a NumPy array for fast vector-optimized masking
+        self.skiplist_arr = np.array(list(skiplist_set), dtype=np.int64)
+    def _prepare_inputs(self, texts: List[str], marker_id: int, max_length: int) -> Dict[str, np.ndarray]:
+        # Lowercase texts if the model is case-insensitive
+        if self.do_lower_case:
+            texts = [t.lower() for t in texts]
+        # Determine the target tokenization length prior to inserting the prefix token
+        token_len = max_length - 1 if marker_id is not None else max_length
+        self.tokenizer.enable_padding(style="max_length", length=token_len)
+        self.tokenizer.enable_truncation(max_length=token_len)
+        encodings = self.tokenizer.encode_batch(texts)
+        input_ids = []
+        attention_masks = []
+        for enc in encodings:
+            ids = list(enc.ids)
+            mask = list(enc.attention_mask)
+            # Insert the ColBERT interaction marker [Q] or [D] right after [CLS] (index 1)
+            if marker_id is not None and len(ids) > 1:
+                ids.insert(1, marker_id)
+                ids = ids[:max_length]
+                mask.insert(1, 1)
+                mask = mask[:max_length]
+            input_ids.append(ids)
+            attention_masks.append(mask)
+        inputs = {
+            "input_ids": np.array(input_ids, dtype=np.int64),
+            "attention_mask": np.array(attention_masks, dtype=np.int64)
+        }
+        # Handle models exported with an optional token_type_ids layer
+        if "token_type_ids" in self.input_names:
+            inputs["token_type_ids"] = np.zeros_like(inputs["input_ids"])
+        return inputs
+    def encode_queries(self, queries: Union[str, List[str]], max_length: int = 32, normalize: bool = True) -> np.ndarray:
+        """
+        Encodes query texts into multi-vector embeddings.
+        Args:
+            queries: A single query string or list of query strings.
+            max_length: Maximum query sequence length (usually 32 for ColBERT).
+            normalize: Whether to apply L2 normalization to the output vectors.
+        Returns:
+            A NumPy array of query embeddings with shape (Batch, Seq_Len, Dim).
+        """
+        if isinstance(queries, str):
+            queries = [queries]
+        onnx_inputs = self._prepare_inputs(queries, self.query_marker_id, max_length)
+        embeddings = self.session.run([self.output_name], onnx_inputs)[0]
+        if normalize:
+            norm = np.linalg.norm(embeddings, axis=-1, keepdims=True)
+            # Optimize in-place division using where filter to avoid zero-division allocation
+            np.divide(embeddings, norm, out=embeddings, where=norm != 0.0)
+        return embeddings
+    def encode_docs(self, docs: Union[str, List[str]], max_length: int = 256, normalize: bool = True) -> np.ndarray:
+        """
+        Encodes document texts into multi-vector embeddings, automatically zeroing out
+        embeddings corresponding to punctuation tokens to reduce index footprint and search noise.
+        Args:
+            docs: A single document string or list of document strings.
+            max_length: Maximum document sequence length (usually 256 for ColBERT).
+            normalize: Whether to apply L2 normalization to the output vectors.
+        Returns:
+            A NumPy array of document embeddings with shape (Batch, Seq_Len, Dim).
+        """
+        if isinstance(docs, str):
+            docs = [docs]
+        onnx_inputs = self._prepare_inputs(docs, self.doc_marker_id, max_length)
+        embeddings = self.session.run([self.output_name], onnx_inputs)[0]
+        # Zero out embeddings for punctuation tokens in the document
+        input_ids = onnx_inputs["input_ids"]
+        # Optimized set membership check using pre-compiled NumPy array
+        mask = np.isin(input_ids, self.skiplist_arr)
+        # Apply the mask via element-wise multiplication (1.0 for words, 0.0 for punctuation)
+        # This executes in-place using continuous memory strides, bypassing index copy overhead
+        keep_mask = (~mask)[:, :, np.newaxis]
+        embeddings *= keep_mask
+        if normalize:
+            norm = np.linalg.norm(embeddings, axis=-1, keepdims=True)
+            np.divide(embeddings, norm, out=embeddings, where=norm != 0.0)
+        return embeddings

intextus/export.py ADDED Viewed

@@ -0,0 +1,107 @@
+import argparse
+import sys
+def main():
+    parser = argparse.ArgumentParser(description="Export a PyTorch ColBERT model to ONNX for intextus.")
+    parser.add_argument("--model", type=str, required=True, help="Hugging Face model ID or path to local PyTorch ColBERT model.")
+    parser.add_argument("--output", type=str, default="model.onnx", help="Path to save the output ONNX model.")
+    parser.add_argument("--tokenizer-output", type=str, default="tokenizer.json", help="Path to save the tokenizer.json file.")
+    args = parser.parse_args()
+    try:
+        import torch
+        import transformers
+    except ImportError:
+        print("Error: PyTorch and Transformers are required for the export utility.")
+        print("Please install them using: pip install torch transformers")
+        sys.exit(1)
+    print(f"Loading ColBERT model from '{args.model}'...")
+    class ColBERTWrapper(torch.nn.Module):
+        def __init__(self, base_model, linear):
+            super().__init__()
+            self.base_model = base_model
+            self.linear = linear
+        def forward(self, input_ids, attention_mask, token_type_ids=None):
+            if token_type_ids is not None:
+                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+            else:
+                outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
+            # Use last hidden state
+            last_hidden_state = outputs.last_hidden_state
+            # Apply the custom linear projection layer
+            if self.linear is not None:
+                embeddings = self.linear(last_hidden_state)
+            else:
+                embeddings = last_hidden_state
+            return embeddings
+    # Load model and tokenizer
+    from transformers import AutoModel, AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    model = AutoModel.from_pretrained(args.model, trust_remote_code=True)
+    # Check for linear projection layer
+    linear = None
+    if hasattr(model, "linear"):
+        linear = model.linear
+    elif hasattr(model, "projection"):
+        linear = model.projection
+    elif hasattr(model, "proj"):
+        linear = model.proj
+    elif hasattr(model, "pooler"):
+        pass
+    wrapper = ColBERTWrapper(model, linear)
+    wrapper.eval()
+    # Create dummy inputs
+    dummy_input_ids = torch.ones(1, 32, dtype=torch.long)
+    dummy_attention_mask = torch.ones(1, 32, dtype=torch.long)
+    dummy_token_type_ids = torch.zeros(1, 32, dtype=torch.long)
+    input_names = ["input_ids", "attention_mask"]
+    dynamic_axes = {
+        "input_ids": {0: "batch_size", 1: "sequence_length"},
+        "attention_mask": {0: "batch_size", 1: "sequence_length"},
+        "embeddings": {0: "batch_size", 1: "sequence_length"}
+    }
+    inputs = (dummy_input_ids, dummy_attention_mask)
+    # Check if the base model accepts token_type_ids
+    import inspect
+    sig = inspect.signature(model.forward)
+    if "token_type_ids" in sig.parameters:
+        inputs = (dummy_input_ids, dummy_attention_mask, dummy_token_type_ids)
+        input_names.append("token_type_ids")
+        dynamic_axes["token_type_ids"] = {0: "batch_size", 1: "sequence_length"}
+    print("Exporting model to ONNX...")
+    torch.onnx.export(
+        wrapper,
+        inputs,
+        args.output,
+        input_names=input_names,
+        output_names=["embeddings"],
+        dynamic_axes=dynamic_axes,
+        opset_version=14,
+        do_constant_folding=True
+    )
+    print(f"ONNX model saved successfully to '{args.output}'")
+    # Save tokenizer.json
+    print(f"Saving tokenizer to '{args.tokenizer_output}'...")
+    tokenizer._tokenizer.save(args.tokenizer_output)
+    print("Done!")
+if __name__ == "__main__":
+    main()

intextus/utils.py ADDED Viewed

@@ -0,0 +1,77 @@
+import string
+from typing import Dict, Set
+import numpy as np
+def compute_maxsim(query_embeddings: np.ndarray, doc_embeddings: np.ndarray) -> float:
+    """
+    Computes the late-interaction MaxSim score between query and document vectors.
+    Args:
+        query_embeddings: Array of shape (Query_Tokens, Dim) representing query vector sequence.
+        doc_embeddings: Array of shape (Doc_Tokens, Dim) representing document vector sequence.
+    Returns:
+        The float score representing late-interaction relevance.
+    """
+    # Compute the dot product matrix between every query token and every document token
+    # Resulting shape: (Query_Tokens, Doc_Tokens)
+    scores = np.dot(query_embeddings, doc_embeddings.T)
+    # Take the maximum score across the document tokens for each query token
+    max_scores_per_query_token = np.max(scores, axis=1)
+    # Sum the maximums together to get final relevance score
+    return float(np.sum(max_scores_per_query_token))
+def get_punctuation_token_ids(
+    vocab: Dict[str, int],
+    query_marker: str = "[Q]",
+    doc_marker: str = "[D]"
+) -> Set[int]:
+    """
+    Identifies tokenizer vocabulary IDs that correspond to punctuation marks.
+    This is used to construct a skiplist for document token masking.
+    Args:
+        vocab: Dictionary mapping token strings to their integer IDs.
+        query_marker: Token representing query interaction.
+        doc_marker: Token representing document interaction.
+    Returns:
+        A set of token IDs to be masked/skipped.
+    """
+    punctuation_chars = set(string.punctuation)
+    skiplist_ids = set()
+    # Common prefix/suffix subword markers used by various tokenizers
+    clean_markers = ["##", "Ġ", " ", "</w>"]
+    # Explicitly protect standard control tokens and query/doc markers
+    protected_tokens = {
+        query_marker,
+        doc_marker,
+        "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[UNK]",
+        "<s>", "</s>", "<pad>", "<mask>", "<unk>"
+    }
+    for token, token_id in vocab.items():
+        if token in protected_tokens:
+            continue
+        cleaned = token
+        for marker in clean_markers:
+            cleaned = cleaned.replace(marker, "")
+        # Exclude special/control tokens (usually wrapped in [] or <> and longer than 1 char)
+        if len(token) > 1 and (
+            (token.startswith("[") and token.endswith("]")) or
+            (token.startswith("<") and token.endswith(">"))
+        ):
+            continue
+        # A token is considered punctuation if its cleaned representation consists
+        # entirely of standard punctuation characters (and is not empty).
+        if cleaned and all(char in punctuation_chars for char in cleaned):
+            skiplist_ids.add(token_id)
+    return skiplist_ids

intextus_embed-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,94 @@
+Metadata-Version: 2.4
+Name: intextus-embed
+Version: 0.1.0
+Summary: A lightweight, zero-PyTorch ONNX encoder for generic ColBERT models.
+License: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: onnxruntime>=1.16.0
+Requires-Dist: tokenizers>=0.19.0
+Requires-Dist: numpy>=1.22.0
+Dynamic: license-file
+# 🕸️ intextus
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+**intextus** (Latin for *"woven into the text"*) is an ultra-lightweight, 100% PyTorch-free, and production-grade Python library designed to encode late-interaction ColBERT multi-vectors.
+By replacing massive deep learning libraries with highly optimized, compiled C++/Rust backends, **intextus** delivers full ColBERT MaxSim embeddings in **under 65MB of RAM** with **zero PyTorch or Transformers dependencies**. It is optimized for edge devices, serverless functions (AWS Lambda, Cloudflare Workers), and resource-constrained environments.
+---
+## ⚡ Key Features
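+- **No PyTorch or Transformers:** Inference is fully decoupled from the heavy PyTorch/Transformers stack; those packages are only needed for the optional `intextus-export` conversion utility. A simple `pip install` completes in seconds.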
+- **Micro Memory Footprint:** Executes multi-vector graphs inside ONNX Runtime, drawing less than 65MB of RAM during inference.
+- **Fast Rust Tokenization:** Uses Hugging Face's raw Rust tokenization backend directly.
+- **Dynamic Punctuation Skiplist:** Scans the tokenizer vocabulary at initialization and precomputes a skiplist of punctuation token IDs, which is used to zero out punctuation vectors in document embeddings, mirroring ColBERT's punctuation-skipping behavior.
+- **Standardized Late Interaction:** Exposes native NumPy-based MaxSim calculations.
+---
+## 📦 Installation
+Install the library directly via pip:
+```bash
+pip install intextus-embed
+```
+> [!NOTE]
+> `intextus` currently defaults to highly optimized CPU inference. Full hardware acceleration and GPU execution support are planned for a future release.
+---
+## 🚀 Quick Start
+Here is how to load a model, extract multi-vector embeddings, and compute late-interaction cross-similarity scores entirely in NumPy:
+```python
+from intextus import IntextusEncoder, compute_maxsim
+# Initialize the encoder (defaults to intextus/mxbai-edge-colbert-v0-17m-onnx)
+model = IntextusEncoder()
+# Or initialize from a local directory containing 'model.onnx' and 'tokenizer.json'
+# model = IntextusEncoder("./my_model_directory")
+# Extract query and document embeddings (Batch_Size, Sequence_Length, Dimension)
+query_embeddings = model.encode_queries("What is ultra-low latency?")
+doc_embeddings = model.encode_docs("ONNX runtime bypasses the PyTorch layer completely.")
+# Compute the cross-similarity score via NumPy (using the first item in the batch)
+score = compute_maxsim(query_embeddings[0], doc_embeddings[0])
+print(f"Relevance Score (MaxSim): {score:.4f}")
+```
+---
+## 🎯 Supported & Tested Models
+`intextus` is designed for ultra-fast, edge-compatible ColBERT execution. The primary officially supported and fully validated models are:
+- **`intextus/mxbai-edge-colbert-v0-17m-onnx`** (Alias: `mxbai-edge-colbert-v0-17m`) — A highly-optimized, single-file ONNX representation of ModernBERT-backed `mxbai-edge-colbert-v0-17m` (66 MB, 48-dimensional late-interaction embeddings). **(Default Model)**
+- **`intextus/mxbai-edge-colbert-v0-32m-onnx`** (Alias: `mxbai-edge-colbert-v0-32m`) — A larger, higher-capacity ONNX representation of ModernBERT-backed `mxbai-edge-colbert-v0-32m` (124 MB, 64-dimensional late-interaction embeddings).
+- **`intextus/lateon-onnx`** (Alias: `lateon`) — A high-capacity base ModernBERT-backed model (580 MB, 128-dimensional late-interaction embeddings). Note: LateOn is case-sensitive, so load it with `IntextusEncoder("lateon", do_lower_case=False)`.
+> [!NOTE]
+> Any ColBERT model exported via standard Hugging Face/PyLate workflows can be loaded locally by providing the path to its `model.onnx` and `tokenizer.json`.
+---
+## ⚖️ License
+This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

intextus_embed-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+intextus/__init__.py,sha256=t6j6n8TW5quU-BH7f05M7ITHv6v4GCUgQ-KtAOnmX2M,120
+intextus/encoder.py,sha256=PuiCw6egJAUTqW5PPrHBpibP189axweRIxdAwe1EFlU,9495
+intextus/export.py,sha256=K8LMl_VnrDXZp8O-xmGx03tdw0auapsnR4t-Ypdwfzo,3942
+intextus/utils.py,sha256=KxpN4KHHREshel0ll5ZA_wf52HQkUFoY-6Jy2uNrgHo,2865
+intextus_embed-0.1.0.dist-info/licenses/LICENSE,sha256=UGbRVzCpgoCCzeyERJ5mvwzBx6fyBv7bPV3foxPQTCM,1073
+intextus_embed-0.1.0.dist-info/METADATA,sha256=dxYQFei3s7AuX3xnXFQ07xIcO7yr3Tph6qbV6ZI1IUY,4341
+intextus_embed-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+intextus_embed-0.1.0.dist-info/entry_points.txt,sha256=E9BpCOeNsO_B6OVvBDko4NuhN47SgsomP0_6psNBh7Y,57
+intextus_embed-0.1.0.dist-info/top_level.txt,sha256=XzKpIJuni5qhoZ_J-BHkV7FeUhkQrdOEuk9EGBFwdMs,9
+intextus_embed-0.1.0.dist-info/RECORD,,

intextus_embed-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

intextus_embed-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ intextus-export = intextus.export:main

intextus_embed-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 intextus Authors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

intextus_embed-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ intextus