PyPI - torch-rechub - Versions diffs - 0.0.6__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

torch-rechub 0.0.6__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

torch_rechub/basic/layers.py +228 -159
torch_rechub/basic/loss_func.py +62 -47
torch_rechub/data/dataset.py +18 -31
torch_rechub/models/generative/hstu.py +48 -33
torch_rechub/serving/__init__.py +50 -0
torch_rechub/serving/annoy.py +133 -0
torch_rechub/serving/base.py +107 -0
torch_rechub/serving/faiss.py +154 -0
torch_rechub/serving/milvus.py +215 -0
torch_rechub/trainers/ctr_trainer.py +12 -2
torch_rechub/trainers/match_trainer.py +13 -2
torch_rechub/trainers/mtl_trainer.py +12 -2
torch_rechub/trainers/seq_trainer.py +34 -15
torch_rechub/types.py +5 -0
torch_rechub/utils/data.py +191 -145
torch_rechub/utils/hstu_utils.py +87 -76
torch_rechub/utils/model_utils.py +10 -12
torch_rechub/utils/onnx_export.py +98 -45
torch_rechub/utils/quantization.py +128 -0
torch_rechub/utils/visualization.py +4 -12
{torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/METADATA +34 -18
{torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/RECORD +24 -18
torch_rechub/trainers/matching.md +0 -3
{torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/WHEEL +0 -0
{torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/licenses/LICENSE +0 -0

torch_rechub/basic/loss_func.py CHANGED Viewed

@@ -4,13 +4,24 @@ import torch.nn as nn
 class RegularizationLoss(nn.Module):
-    """Unified L1/L2 Regularization Loss for embedding and dense parameters.
-    Example:
-        >>> reg_loss_fn = RegularizationLoss(embedding_l2=1e-5, dense_l2=1e-5)
-        >>> # In model's forward or trainer
-        >>> reg_loss = reg_loss_fn(model)
-        >>> total_loss = task_loss + reg_loss
+    """Unified L1/L2 regularization for embedding and dense parameters.
+    Parameters
+    ----------
+    embedding_l1 : float, default=0.0
+        L1 coefficient for embedding parameters.
+    embedding_l2 : float, default=0.0
+        L2 coefficient for embedding parameters.
+    dense_l1 : float, default=0.0
+        L1 coefficient for dense (non-embedding) parameters.
+    dense_l2 : float, default=0.0
+        L2 coefficient for dense (non-embedding) parameters.
+    Examples
+    --------
+    >>> reg_loss_fn = RegularizationLoss(embedding_l2=1e-5, dense_l2=1e-5)
+    >>> reg_loss = reg_loss_fn(model)
+    >>> total_loss = task_loss + reg_loss
     """
     def __init__(self, embedding_l1=0.0, embedding_l2=0.0, dense_l1=0.0, dense_l2=0.0):
@@ -58,9 +69,11 @@ class RegularizationLoss(nn.Module):
 class HingeLoss(torch.nn.Module):
-    """Hinge Loss for pairwise learning.
-    reference: https://github.com/ustcml/RecStudio/blob/main/recstudio/model/loss_func.py
+    """Hinge loss for pairwise learning.
+    Notes
+    -----
+    Reference: https://github.com/ustcml/RecStudio/blob/main/recstudio/model/loss_func.py
     """
     def __init__(self, margin=2, num_items=None):
@@ -89,27 +102,28 @@ class BPRLoss(torch.nn.Module):
 class NCELoss(torch.nn.Module):
-    """Noise Contrastive Estimation (NCE) Loss for recommendation systems.
-    NCE Loss is more efficient than CrossEntropyLoss for large-scale recommendation
-    scenarios. It uses in-batch negatives to reduce computational complexity.
-    Reference:
-        - Noise-contrastive estimation: A new estimation principle for unnormalized
-          statistical models (Gutmann & Hyvärinen, 2010)
-        - HLLM: Hierarchical Large Language Model for Recommendation
-    Args:
-        temperature (float): Temperature parameter for scaling logits. Default: 1.0
-        ignore_index (int): Index to ignore in loss computation. Default: 0
-        reduction (str): Specifies the reduction to apply to the output.
-                        Options: 'mean', 'sum', 'none'. Default: 'mean'
-    Example:
-        >>> nce_loss = NCELoss(temperature=0.1)
-        >>> logits = torch.randn(32, 1000)  # (batch_size, vocab_size)
-        >>> targets = torch.randint(0, 1000, (32,))
-        >>> loss = nce_loss(logits, targets)
+    """Noise Contrastive Estimation (NCE) loss for recommender systems.
+    Parameters
+    ----------
+    temperature : float, default=1.0
+        Temperature for scaling logits.
+    ignore_index : int, default=0
+        Target index to ignore.
+    reduction : {'mean', 'sum', 'none'}, default='mean'
+        Reduction applied to the output.
+    Notes
+    -----
+    - Gutmann & Hyvärinen (2010), Noise-contrastive estimation.
+    - HLLM: Hierarchical Large Language Model for Recommendation.
+    Examples
+    --------
+    >>> nce_loss = NCELoss(temperature=0.1)
+    >>> logits = torch.randn(32, 1000)
+    >>> targets = torch.randint(0, 1000, (32,))
+    >>> loss = nce_loss(logits, targets)
     """
     def __init__(self, temperature=1.0, ignore_index=0, reduction='mean'):
@@ -158,23 +172,24 @@ class NCELoss(torch.nn.Module):
 class InBatchNCELoss(torch.nn.Module):
-    """In-Batch NCE Loss with explicit negative sampling.
-    This loss function uses other samples in the batch as negative samples,
-    which is more efficient than sampling random negatives.
-    Args:
-        temperature (float): Temperature parameter for scaling logits. Default: 0.1
-        ignore_index (int): Index to ignore in loss computation. Default: 0
-        reduction (str): Specifies the reduction to apply to the output.
-                        Options: 'mean', 'sum', 'none'. Default: 'mean'
-    Example:
-        >>> loss_fn = InBatchNCELoss(temperature=0.1)
-        >>> embeddings = torch.randn(32, 256)  # (batch_size, embedding_dim)
-        >>> item_embeddings = torch.randn(1000, 256)  # (vocab_size, embedding_dim)
-        >>> targets = torch.randint(0, 1000, (32,))
-        >>> loss = loss_fn(embeddings, item_embeddings, targets)
+    """In-batch NCE loss with explicit negatives.
+    Parameters
+    ----------
+    temperature : float, default=0.1
+        Temperature for scaling logits.
+    ignore_index : int, default=0
+        Target index to ignore.
+    reduction : {'mean', 'sum', 'none'}, default='mean'
+        Reduction applied to the output.
+    Examples
+    --------
+    >>> loss_fn = InBatchNCELoss(temperature=0.1)
+    >>> embeddings = torch.randn(32, 256)
+    >>> item_embeddings = torch.randn(1000, 256)
+    >>> targets = torch.randint(0, 1000, (32,))
+    >>> loss = loss_fn(embeddings, item_embeddings, targets)
     """
     def __init__(self, temperature=0.1, ignore_index=0, reduction='mean'):

torch_rechub/data/dataset.py CHANGED Viewed

@@ -1,40 +1,35 @@
 """Dataset implementations providing streaming, batch-wise data access for PyTorch."""
-import os
 import typing as ty
 import pyarrow.dataset as pd
 import torch
 from torch.utils.data import IterableDataset, get_worker_info
-from .convert import pa_array_to_tensor
+from torch_rechub.types import FilePath
-# Type for path to a file
-_FilePath = ty.Union[str, os.PathLike]
+from .convert import pa_array_to_tensor
 # The default batch size when reading a Parquet dataset
 _DEFAULT_BATCH_SIZE = 1024
 class ParquetIterableDataset(IterableDataset):
-    """
-    IterableDataset that streams data from one or more Parquet files.
+    """Stream Parquet data as PyTorch tensors.
     Parameters
     ----------
-    file_paths : list[_FilePath]
+    file_paths : list[FilePath]
         Paths to Parquet files.
     columns : list[str], optional
-        Column names to select. If ``None``, all columns are read.
-    batch_size : int, default DEFAULT_BATCH_SIZE
-        Number of rows per streamed batch.
+        Columns to select; if ``None``, read all columns.
+    batch_size : int, default _DEFAULT_BATCH_SIZE
+        Rows per streamed batch.
     Notes
     -----
-    This dataset reads data lazily and never loads the entire Parquet dataset to memory.
-    The current worker receives a partition of ``file_paths`` and builds its own PyArrow
-    Dataset and Scanner. Iteration yields dictionaries mapping column names to PyTorch
-    tensors created via NumPy, one batch at a time.
+    Reads lazily; no full Parquet load. Each worker gets a partition, builds its
+    own PyArrow Dataset/Scanner, and yields dicts of column tensors batch by batch.
     Examples
     --------
@@ -44,16 +39,14 @@ class ParquetIterableDataset(IterableDataset):
     ...     batch_size=1024,
     ... )
     >>> loader = DataLoader(ds, batch_size=None)
-    >>> # Now iterate over batches.
     >>> for batch in loader:
     ...     x, y, label = batch["x"], batch["y"], batch["label"]
-    ...     # Do some work.
     ...     ...
     """
     def __init__(
         self,
-        file_paths: ty.Sequence[_FilePath],
+        file_paths: ty.Sequence[FilePath],
         /,
         columns: ty.Optional[ty.Sequence[str]] = None,
         batch_size: int = _DEFAULT_BATCH_SIZE,
@@ -64,17 +57,15 @@ class ParquetIterableDataset(IterableDataset):
         self._batch_size = batch_size
     def __iter__(self) -> ty.Iterator[dict[str, torch.Tensor]]:
-        """
-        Stream Parquet data as mapped PyTorch tensors.
+        """Stream Parquet data as mapped PyTorch tensors.
-        Build a PyArrow Dataset from the current worker's assigned file partition, then
-        create a Scanner to lazily read batches of the selected columns. Each batch is
-        converted to a dict mapping column names to PyTorch tensors (via NumPy).
+        Builds a PyArrow Dataset from the current worker's file partition, then
+        lazily scans selected columns. Each batch becomes a dict of Torch tensors.
         Returns
         -------
         Iterator[dict[str, torch.Tensor]]
-            An iterator that yields one converted batch at a time.
+            One converted batch at a time.
         """
         if not (partition := self._get_partition()):
             return
@@ -95,19 +86,15 @@ class ParquetIterableDataset(IterableDataset):
     # private interfaces
     def _get_partition(self) -> tuple[str, ...]:
-        """
-        Get the partition of file paths for the current worker.
-        This method splits the full list of file paths into contiguous partitions with
-        a nearly equal size by the total number of workers and the current worker ID.
+        """Get file partition for the current worker.
-        If running in the main process (i.e., no worker information is available), the
-        entire list of file paths is returned.
+        Splits file paths into contiguous partitions by number of workers and worker ID.
+        In the main process (no worker info), returns all paths.
         Returns
         -------
         tuple[str, ...]
-            The partition of file paths for the current worker.
+            Partition of file paths for this worker.
         """
         if (info := get_worker_info()) is None:
             return self._file_paths

torch_rechub/models/generative/hstu.py CHANGED Viewed

@@ -10,39 +10,54 @@ from torch_rechub.utils.hstu_utils import RelPosBias
 class HSTUModel(nn.Module):
-    """HSTU: Hierarchical Sequential Transduction Units model.
-    Autoregressive generative recommendation model for sequential data.
-    This module stacks multiple ``HSTUBlock`` layers to capture long-range
-    dependencies in user interaction sequences and predicts the next item.
-    Args:
-        vocab_size (int): Vocabulary size (number of distinct items, including PAD).
-        d_model (int): Hidden dimension of the model. Default: 512.
-        n_heads (int): Number of attention heads. Default: 8.
-        n_layers (int): Number of stacked HSTU layers. Default: 4.
-        dqk (int): Dimension of query/key vectors per head. Default: 64.
-        dv (int): Dimension of value vectors per head. Default: 64.
-        max_seq_len (int): Maximum sequence length. Default: 256.
-        dropout (float): Dropout rate applied in the model. Default: 0.1.
-        use_rel_pos_bias (bool): Whether to use relative position bias. Default: True.
-        use_time_embedding (bool): Whether to use time-difference embeddings. Default: True.
-        num_time_buckets (int): Number of time buckets for time embeddings. Default: 2048.
-        time_bucket_fn (str): Function used to bucketize time differences, ``"sqrt"``
-            or ``"log"``. Default: ``"sqrt"``.
-    Shape:
-        - Input: ``x`` of shape ``(batch_size, seq_len)``; optional ``time_diffs``
-          of shape ``(batch_size, seq_len)`` representing time differences in seconds.
-        - Output: Logits of shape ``(batch_size, seq_len, vocab_size)``.
-    Example:
-        >>> model = HSTUModel(vocab_size=100000, d_model=512)
-        >>> x = torch.randint(0, 100000, (32, 256))
-        >>> time_diffs = torch.randint(0, 86400, (32, 256))
-        >>> logits = model(x, time_diffs)
-        >>> logits.shape
-        torch.Size([32, 256, 100000])
+    """HSTU: Hierarchical Sequential Transduction Units.
+    Autoregressive generative recommender that stacks ``HSTUBlock`` layers to
+    capture long-range dependencies and predict the next item.
+    Parameters
+    ----------
+    vocab_size : int
+        Vocabulary size (items incl. PAD).
+    d_model : int, default=512
+        Hidden dimension.
+    n_heads : int, default=8
+        Attention heads.
+    n_layers : int, default=4
+        Number of stacked HSTU layers.
+    dqk : int, default=64
+        Query/key dim per head.
+    dv : int, default=64
+        Value dim per head.
+    max_seq_len : int, default=256
+        Maximum sequence length.
+    dropout : float, default=0.1
+        Dropout rate.
+    use_rel_pos_bias : bool, default=True
+        Use relative position bias.
+    use_time_embedding : bool, default=True
+        Use time-difference embeddings.
+    num_time_buckets : int, default=2048
+        Number of time buckets for time embeddings.
+    time_bucket_fn : {'sqrt', 'log'}, default='sqrt'
+        Bucketization function for time differences.
+    Shape
+    -----
+    Input
+        x : ``(batch_size, seq_len)``
+        time_diffs : ``(batch_size, seq_len)``, optional (seconds).
+    Output
+        logits : ``(batch_size, seq_len, vocab_size)``
+    Examples
+    --------
+    >>> model = HSTUModel(vocab_size=100000, d_model=512)
+    >>> x = torch.randint(0, 100000, (32, 256))
+    >>> time_diffs = torch.randint(0, 86400, (32, 256))
+    >>> logits = model(x, time_diffs)
+    >>> logits.shape
+    torch.Size([32, 256, 100000])
     """
     def __init__(self, vocab_size, d_model=512, n_heads=8, n_layers=4, dqk=64, dv=64, max_seq_len=256, dropout=0.1, use_rel_pos_bias=True, use_time_embedding=True, num_time_buckets=2048, time_bucket_fn='sqrt'):

torch_rechub/serving/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+import typing as ty
+from .annoy import AnnoyBuilder
+from .base import BaseBuilder
+from .faiss import FaissBuilder
+from .milvus import MilvusBuilder
+# Type for supported retrieval models: the backend names that ``builder_factory``
+# accepts for its ``model`` argument.
+_RetrievalModel = ty.Literal["annoy", "faiss", "milvus"]
+def builder_factory(model: _RetrievalModel, **builder_config) -> BaseBuilder:
+    """
+    Create the vector index builder for the requested retrieval backend.
+    The backend name is compared against the supported names and the matching
+    ``BaseBuilder`` implementation is instantiated with ``builder_config``. Constructing
+    or loading the underlying ANN index is left to the returned builder's own
+    ``from_embeddings`` or ``from_index_file`` method.
+    Parameters
+    ----------
+    model : "annoy", "faiss", or "milvus"
+        Name of the retrieval backend to use.
+    **builder_config
+        Keyword arguments forwarded unchanged to the selected builder constructor.
+    Returns
+    -------
+    BaseBuilder
+        A new builder instance for the selected retrieval backend.
+    Raises
+    ------
+    NotImplementedError
+        If ``model`` does not name a supported retrieval backend.
+    """
+    # Scanned in order, so the name comparisons run exactly as an if-chain would.
+    supported_backends = (
+        ("annoy", AnnoyBuilder),
+        ("faiss", FaissBuilder),
+        ("milvus", MilvusBuilder),
+    )
+    for backend_name, builder_cls in supported_backends:
+        if model == backend_name:
+            return builder_cls(**builder_config)
+    raise NotImplementedError(f"{model=} is not implemented yet!")
+# Only the factory is exported by name from this package.
+__all__ = ["builder_factory"]

torch_rechub/serving/annoy.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""ANNOY-based vector index implementation for the retrieval stage."""
+import contextlib
+import typing as ty
+import annoy
+import numpy as np
+import torch
+from torch_rechub.types import FilePath
+from .base import BaseBuilder, BaseIndexer
+# Type for distance metrics for the ANNOY index.
+_AnnoyMetric = ty.Literal["angular", "euclidean", "dot"]
+# Default distance metric used by ANNOY when the caller does not pass ``metric``.
+_DEFAULT_METRIC: _AnnoyMetric = "angular"
+# Default number of trees to build in the ANNOY index.
+_DEFAULT_N_TREES = 10
+# Default number of worker threads for building the ANNOY index.
+# NOTE(review): -1 presumably lets ANNOY choose the thread count -- confirm in ANNOY docs.
+_DEFAULT_THREADS = -1
+# Default number of nodes to inspect during an ANNOY search.
+# NOTE(review): -1 presumably defers to ANNOY's own default -- confirm in ANNOY docs.
+_DEFAULT_SEARCHK = -1
+class AnnoyBuilder(BaseBuilder):
+    """ANNOY-based implementation of ``BaseBuilder``.
+    Holds the build-time configuration and hands out ``AnnoyIndexer`` objects through
+    context managers that unload the underlying ANNOY index on exit.
+    """
+    def __init__(
+        self,
+        d: int,
+        metric: _AnnoyMetric = _DEFAULT_METRIC,
+        *,
+        n_trees: int = _DEFAULT_N_TREES,
+        threads: int = _DEFAULT_THREADS,
+        searchk: int = _DEFAULT_SEARCHK,
+    ) -> None:
+        """
+        Initialize an ANNOY builder.
+        Parameters
+        ----------
+        d : int
+            The dimension of embeddings.
+        metric : ``"angular"``, ``"euclidean"``, or ``"dot"``, optional
+            The indexing metric. Default to ``"angular"``.
+        n_trees : int, optional
+            Number of trees to build an ANNOY index.
+        threads : int, optional
+            Number of worker threads to build an ANNOY index.
+        searchk : int, optional
+            Number of nodes to inspect during an ANNOY search.
+        """
+        self._d = d
+        self._metric = metric
+        self._n_trees = n_trees
+        self._threads = threads
+        self._searchk = searchk
+    @contextlib.contextmanager
+    def from_embeddings(
+        self,
+        embeddings: torch.Tensor,
+    ) -> ty.Generator["AnnoyIndexer", None, None]:
+        """Adhere to ``BaseBuilder.from_embeddings``.
+        Row ``i`` of ``embeddings`` is added to the index under item ID ``i``. The
+        index is built before the indexer is yielded and unloaded when the context
+        exits.
+        """
+        index = annoy.AnnoyIndex(self._d, metric=self._metric)
+        # Convert the whole batch once to a CPU NumPy array (the form
+        # ``AnnoyIndexer.query`` hands to ANNOY) instead of passing torch rows that
+        # ANNOY would have to convert element by element. ``detach`` allows tensors
+        # that track gradients; ``float`` keeps dtypes NumPy cannot represent usable.
+        vectors = embeddings.detach().cpu().float().numpy()
+        for idx, vector in enumerate(vectors):
+            index.add_item(idx, vector)
+        index.build(self._n_trees, n_jobs=self._threads)
+        try:
+            yield AnnoyIndexer(index, searchk=self._searchk)
+        finally:
+            index.unload()
+    @contextlib.contextmanager
+    def from_index_file(
+        self,
+        index_file: FilePath,
+    ) -> ty.Generator["AnnoyIndexer", None, None]:
+        """Adhere to ``BaseBuilder.from_index_file``.
+        The index is loaded from ``index_file`` using this builder's ``d`` and
+        ``metric``, and unloaded when the context exits.
+        """
+        index = annoy.AnnoyIndex(self._d, metric=self._metric)
+        index.load(str(index_file))
+        try:
+            yield AnnoyIndexer(index, searchk=self._searchk)
+        finally:
+            index.unload()
+class AnnoyIndexer(BaseIndexer):
+    """ANNOY-based implementation of ``BaseIndexer``."""
+    def __init__(self, index: annoy.AnnoyIndex, searchk: int) -> None:
+        """Initialize an ANNOY indexer.
+        Parameters
+        ----------
+        index : annoy.AnnoyIndex
+            The ANNOY index to query and save.
+        searchk : int
+            Passed as ``search_k`` to every ANNOY search.
+        """
+        self._index = index
+        self._searchk = searchk
+    def query(
+        self,
+        embeddings: torch.Tensor,
+        top_k: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Adhere to ``BaseIndexer.query``.
+        Each row of ``embeddings`` is searched separately. The results are returned
+        as an int64 tensor of neighbor IDs and a float32 tensor of the distances
+        reported by ANNOY, both of shape ``(n, top_k)``.
+        """
+        n, _ = embeddings.shape
+        # Move and convert the batch once rather than once per row. ``detach`` lets
+        # tensors that still track gradients be converted to NumPy.
+        queries = embeddings.detach().cpu().numpy()
+        nn_ids = np.zeros((n, top_k), dtype=np.int64)
+        nn_distances = np.zeros((n, top_k), dtype=np.float32)
+        for idx, query_vector in enumerate(queries):
+            # NOTE(review): the row assignment assumes ANNOY returns exactly
+            # ``top_k`` neighbors; confirm behavior when the index holds fewer items.
+            nn_ids[idx], nn_distances[idx] = self._index.get_nns_by_vector(
+                query_vector,
+                top_k,
+                search_k=self._searchk,
+                include_distances=True,
+            )
+        return torch.from_numpy(nn_ids), torch.from_numpy(nn_distances)
+    def save(self, file_path: FilePath) -> None:
+        """Adhere to ``BaseIndexer.save``."""
+        # The path is converted to ``str`` before being handed to ANNOY.
+        self._index.save(str(file_path))

torch_rechub/serving/base.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""Base abstraction for vector indexers used in the retrieval stage."""
+import abc
+import typing as ty
+import torch
+from torch_rechub.types import FilePath
+class BaseBuilder(abc.ABC):
+    """
+    Abstract base class for vector index construction.
+    A builder owns all build-time configuration and produces a ``BaseIndexer`` through a
+    context-managed build operation. This class only declares the interface; concrete
+    subclasses implement ``from_embeddings`` and ``from_index_file``.
+    Examples
+    --------
+    >>> builder = SomeBuilder(...)  # any concrete ``BaseBuilder`` subclass
+    >>> embeddings = torch.randn(1000, 128)
+    >>> with builder.from_embeddings(embeddings) as indexer:
+    ...     ids, scores = indexer.query(embeddings[:2], top_k=5)
+    ...     indexer.save("index.bin")
+    >>> with builder.from_index_file("index.bin") as indexer:
+    ...     ids, scores = indexer.query(embeddings[:2], top_k=5)
+    """
+    @abc.abstractmethod
+    def from_embeddings(
+        self,
+        embeddings: torch.Tensor,
+    ) -> ty.ContextManager["BaseIndexer"]:
+        """
+        Build a vector index from the embeddings.
+        Parameters
+        ----------
+        embeddings : torch.Tensor
+            A 2D tensor (n, d) containing embedding vectors to build a new index.
+        Returns
+        -------
+        ContextManager[BaseIndexer]
+            A context manager that yields a fully initialized ``BaseIndexer``. Use it
+            in a ``with`` statement; the indexer is meant to be used inside that block.
+        """
+    @abc.abstractmethod
+    def from_index_file(
+        self,
+        index_file: FilePath,
+    ) -> ty.ContextManager["BaseIndexer"]:
+        """
+        Load a vector index from an index file.
+        Parameters
+        ----------
+        index_file : FilePath
+            Path to a serialized index on disk to be loaded.
+        Returns
+        -------
+        ContextManager[BaseIndexer]
+            A context manager that yields a fully initialized ``BaseIndexer``. Use it
+            in a ``with`` statement; the indexer is meant to be used inside that block.
+        """
+class BaseIndexer(abc.ABC):
+    """Abstract base class for vector indexers in the retrieval stage.
+    An indexer answers nearest-neighbor queries against an index and can persist that
+    index to disk. This class only declares the interface.
+    """
+    @abc.abstractmethod
+    def query(
+        self,
+        embeddings: torch.Tensor,
+        top_k: int,
+    ) -> tuple[torch.Tensor,
+               torch.Tensor]:
+        """
+        Query the vector index.
+        Parameters
+        ----------
+        embeddings : torch.Tensor
+            A 2D tensor (n, d) containing embedding vectors to query the index.
+        top_k : int
+            The number of nearest items to retrieve for each vector.
+        Returns
+        -------
+        torch.Tensor
+            A 2D tensor of shape (n, top_k), containing the retrieved nearest neighbor
+            IDs for each vector, ordered by descending relevance.
+        torch.Tensor
+            A 2D tensor of shape (n, top_k), containing the distance or score reported
+            for each retrieved neighbor, aligned element-wise with the IDs.
+            NOTE(review): whether a smaller or larger value means "more relevant"
+            presumably depends on the backend and its metric -- confirm per subclass.
+        """
+    @abc.abstractmethod
+    def save(self, file_path: FilePath) -> None:
+        """
+        Persist the index to local disk.
+        Parameters
+        ----------
+        file_path : FilePath
+            Destination path where the index will be saved.
+        """

torch-rechub 0.0.6__py3-none-any.whl → 0.2.0__py3-none-any.whl

torch-rechub 0.0.6__py3-none-any.whl → 0.2.0__py3-none-any.whl