torch-rechub 0.0.6__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torch_rechub/basic/layers.py +228 -159
- torch_rechub/basic/loss_func.py +62 -47
- torch_rechub/data/dataset.py +18 -31
- torch_rechub/models/generative/hstu.py +48 -33
- torch_rechub/serving/__init__.py +50 -0
- torch_rechub/serving/annoy.py +133 -0
- torch_rechub/serving/base.py +107 -0
- torch_rechub/serving/faiss.py +154 -0
- torch_rechub/serving/milvus.py +215 -0
- torch_rechub/trainers/ctr_trainer.py +12 -2
- torch_rechub/trainers/match_trainer.py +13 -2
- torch_rechub/trainers/mtl_trainer.py +12 -2
- torch_rechub/trainers/seq_trainer.py +34 -15
- torch_rechub/types.py +5 -0
- torch_rechub/utils/data.py +191 -145
- torch_rechub/utils/hstu_utils.py +87 -76
- torch_rechub/utils/model_utils.py +10 -12
- torch_rechub/utils/onnx_export.py +98 -45
- torch_rechub/utils/quantization.py +128 -0
- torch_rechub/utils/visualization.py +4 -12
- {torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/METADATA +34 -18
- {torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/RECORD +24 -18
- torch_rechub/trainers/matching.md +0 -3
- {torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/WHEEL +0 -0
- {torch_rechub-0.0.6.dist-info → torch_rechub-0.2.0.dist-info}/licenses/LICENSE +0 -0
torch_rechub/utils/data.py
CHANGED
@@ -82,57 +82,67 @@ class DataGenerator(object):


 def get_auto_embedding_dim(num_classes):
-    """
-
-
-
-
-
-
-
+    """Calculate embedding dim by category size.
+
+    Uses ``emb_dim = floor(6 * num_classes**0.25)`` from DCN (ADKDD'17).
+
+    Parameters
+    ----------
+    num_classes : int
+        Number of categorical classes.
+
+    Returns
+    -------
+    int
+        Recommended embedding dimension.
     """
     return int(np.floor(6 * np.pow(num_classes, 0.25)))


 def get_loss_func(task_type="classification"):
+    """Return default loss by task type."""
     if task_type == "classification":
         return torch.nn.BCELoss()
-
+    if task_type == "regression":
         return torch.nn.MSELoss()
-
-        raise ValueError("task_type must be classification or regression")
+    raise ValueError("task_type must be classification or regression")


 def get_metric_func(task_type="classification"):
+    """Return default metric by task type."""
     if task_type == "classification":
         return roc_auc_score
-
+    if task_type == "regression":
         return mean_squared_error
-
-        raise ValueError("task_type must be classification or regression")
+    raise ValueError("task_type must be classification or regression")


 def generate_seq_feature(data, user_col, item_col, time_col, item_attribute_cols=[], min_item=0, shuffle=True, max_len=50):
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """Generate sequence features and negatives for ranking.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Raw interaction data.
+    user_col : str
+        User id column name.
+    item_col : str
+        Item id column name.
+    time_col : str
+        Timestamp column name.
+    item_attribute_cols : list[str], optional
+        Additional item attribute columns to include in sequences.
+    min_item : int, default=0
+        Minimum items per user; users below are dropped.
+    shuffle : bool, default=True
+        Shuffle train/val/test.
+    max_len : int, default=50
+        Max history length.
+
+    Returns
+    -------
+    tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
+        Train, validation, and test data with sequence features.
     """
     for feat in data:
         le = LabelEncoder()
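
The new ``get_auto_embedding_dim`` docstring only documents the existing heuristic. A quick standalone sanity check of the formula (a sketch, not the package code; it uses ``np.power`` instead of the ``np.pow`` alias the package source calls):

    import numpy as np

    def auto_embedding_dim(num_classes):
        # Same rule as documented above: floor(6 * num_classes ** 0.25)
        return int(np.floor(6 * np.power(num_classes, 0.25)))

    print(auto_embedding_dim(10))      # 6 * 10**0.25    ~= 10.67 -> 10
    print(auto_embedding_dim(10000))   # 6 * 10000**0.25  = 60    -> 60
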
@@ -205,12 +215,17 @@ def generate_seq_feature(data, user_col, item_col, time_col, item_attribute_cols


 def df_to_dict(data):
-    """
-
-
-
-
-
+    """Convert DataFrame to dict inputs accepted by models.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Input dataframe.
+
+    Returns
+    -------
+    dict
+        Mapping of column name to numpy array.
     """
     data_dict = data.to_dict('list')
     for key in data.keys():
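
The documented contract for ``df_to_dict`` is a column-name to numpy-array mapping; a minimal illustration of that format (not the package implementation):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"user_id": [1, 2, 3], "item_id": [10, 20, 30], "label": [1, 0, 1]})

    # One numpy array per column: the dict-of-arrays input format the models accept.
    data_dict = {col: np.array(vals) for col, vals in df.to_dict("list").items()}
    print(data_dict["item_id"])  # [10 20 30]
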
@@ -226,20 +241,28 @@ def neg_sample(click_hist, item_size):


 def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.):
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
+    """Pad list-of-lists sequences to equal length.
+
+    Equivalent to ``tf.keras.preprocessing.sequence.pad_sequences``.
+
+    Parameters
+    ----------
+    sequences : Sequence[Sequence]
+        Input sequences.
+    maxlen : int, optional
+        Maximum length; computed if None.
+    dtype : str, default='int32'
+    padding : {'pre', 'post'}, default='pre'
+        Padding direction.
+    truncating : {'pre', 'post'}, default='pre'
+        Truncation direction.
+    value : float, default=0.0
+        Padding value.
+
+    Returns
+    -------
+    np.ndarray
+        Padded array of shape (n_samples, maxlen).
     """

     assert padding in ["pre", "post"], "Invalid padding={}.".format(padding)
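
Since the docstring pins ``pad_sequences`` to the ``tf.keras`` semantics, a short usage sketch of the documented defaults (``padding='pre'``, ``truncating='pre'``); the expected output below simply follows those semantics:

    from torch_rechub.utils.data import pad_sequences

    seqs = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]
    padded = pad_sequences(seqs, maxlen=4, value=0)
    print(padded)
    # With 'pre' padding and 'pre' truncating this should give:
    # [[ 0  1  2  3]
    #  [ 0  0  4  5]
    #  [ 7  8  9 10]]
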
@@ -265,13 +288,19 @@ def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncati


 def array_replace_with_dict(array, dic):
-    """Replace values in
-
-
-
-
-
-
+    """Replace values in numpy array using a mapping dict.
+
+    Parameters
+    ----------
+    array : np.ndarray
+        Input array.
+    dic : dict
+        Mapping from old to new values.
+
+    Returns
+    -------
+    np.ndarray
+        Array with values replaced.
     """
     # Extract out keys and values
     k = np.array(list(dic.keys()))
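
A usage sketch for ``array_replace_with_dict`` based on the new docstring; it assumes every value in the array has a key in ``dic``, since the behaviour for unmapped values is not documented here:

    import numpy as np
    from torch_rechub.utils.data import array_replace_with_dict

    arr = np.array([[1, 2], [3, 1]])
    mapping = {1: 100, 2: 200, 3: 300}
    print(array_replace_with_dict(arr, mapping))
    # [[100 200]
    #  [300 100]]
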
@@ -284,19 +313,25 @@ def array_replace_with_dict(array, dic):

 # Temporarily reserved for testing purposes(1985312383@qq.com)
 def create_seq_features(data, seq_feature_col=['item_id', 'cate_id'], max_len=50, drop_short=3, shuffle=True):
-    """Build
-
-
-
-
-
-
-
-
-
-
-
-
+    """Build user history sequences by time.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Must contain ``user_id, item_id, cate_id, time``.
+    seq_feature_col : list, default ['item_id', 'cate_id']
+        Columns to generate sequence features.
+    max_len : int, default=50
+        Max history length.
+    drop_short : int, default=3
+        Drop users with sequence length < drop_short.
+    shuffle : bool, default=True
+        Shuffle outputs.
+
+    Returns
+    -------
+    tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
+        Train/val/test splits with sequence features.
     """
     for feat in data:
         le = LabelEncoder()
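
How the documented ``create_seq_features`` interface is meant to be called, sketched with a made-up interaction log that has the required ``user_id, item_id, cate_id, time`` columns (illustrative only; a real log needs enough rows per user to survive ``drop_short`` and the internal split):

    import pandas as pd
    from torch_rechub.utils.data import create_seq_features

    log = pd.DataFrame({
        "user_id": [1, 1, 1, 1, 2, 2, 2, 2],
        "item_id": [10, 11, 12, 13, 20, 21, 22, 23],
        "cate_id": [1, 1, 2, 2, 3, 3, 1, 1],
        "time":    [1, 2, 3, 4, 1, 2, 3, 4],
    })
    train, val, test = create_seq_features(log, max_len=50, drop_short=3)
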
@@ -357,30 +392,32 @@ def create_seq_features(data, seq_feature_col=['item_id', 'cate_id'], max_len=50


 class SeqDataset(Dataset):
-    """Sequence dataset for HSTU-style
-
-
-
-
-
-
-
-
-
-
-
-
-    Shape
-
-
-
-
-
-
-
-
-
-
+    """Sequence dataset for HSTU-style next-item prediction.
+
+    Parameters
+    ----------
+    seq_tokens : np.ndarray
+        Token ids, shape ``(num_samples, seq_len)``.
+    seq_positions : np.ndarray
+        Position indices, shape ``(num_samples, seq_len)``.
+    targets : np.ndarray
+        Target token ids, shape ``(num_samples,)``.
+    seq_time_diffs : np.ndarray
+        Time-difference features, shape ``(num_samples, seq_len)``.
+
+    Shape
+    -----
+    Output tuple: ``(seq_tokens, seq_positions, seq_time_diffs, target)``
+
+    Examples
+    --------
+    >>> seq_tokens = np.random.randint(0, 1000, (100, 256))
+    >>> seq_positions = np.arange(256)[np.newaxis, :].repeat(100, axis=0)
+    >>> seq_time_diffs = np.random.randint(0, 86400, (100, 256))
+    >>> targets = np.random.randint(0, 1000, (100,))
+    >>> dataset = SeqDataset(seq_tokens, seq_positions, targets, seq_time_diffs)
+    >>> len(dataset)
+    100
     """

     def __init__(self, seq_tokens, seq_positions, targets, seq_time_diffs):
@@ -414,29 +451,25 @@ class SeqDataset(Dataset):


 class SequenceDataGenerator(object):
-    """Sequence data generator
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    >>> seq_time_diffs = np.random.randint(0, 86400, (1000, 256))
-    >>> targets = np.random.randint(0, 1000, (1000,))
-    >>> gen = SequenceDataGenerator(seq_tokens, seq_positions, targets, seq_time_diffs)
-    >>> train_loader, val_loader, test_loader = gen.generate_dataloader(batch_size=32)
+    """Sequence data generator for HSTU-style models.
+
+    Wraps :class:`SeqDataset` and builds train/val/test loaders.
+
+    Parameters
+    ----------
+    seq_tokens : np.ndarray
+        Token ids, shape ``(num_samples, seq_len)``.
+    seq_positions : np.ndarray
+        Position indices, shape ``(num_samples, seq_len)``.
+    targets : np.ndarray
+        Target token ids, shape ``(num_samples,)``.
+    seq_time_diffs : np.ndarray
+        Time-difference features, shape ``(num_samples, seq_len)``.
+
+    Examples
+    --------
+    >>> gen = SequenceDataGenerator(seq_tokens, seq_positions, targets, seq_time_diffs)
+    >>> train_loader, val_loader, test_loader = gen.generate_dataloader(batch_size=32)
     """

     def __init__(self, seq_tokens, seq_positions, targets, seq_time_diffs):
@@ -449,44 +482,57 @@ class SequenceDataGenerator(object):
         # Underlying dataset
         self.dataset = SeqDataset(seq_tokens, seq_positions, targets, seq_time_diffs)

-    def generate_dataloader(self, batch_size=32, num_workers=0, split_ratio=None):
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def generate_dataloader(self, batch_size=32, num_workers=0, split_ratio=None, shuffle=True):
+        """Generate dataloader(s) from the dataset.
+
+        Parameters
+        ----------
+        batch_size : int, default=32
+            Batch size for DataLoader.
+        num_workers : int, default=0
+            Number of workers for DataLoader.
+        split_ratio : tuple or None, default=None
+            If None, returns a single DataLoader without splitting the data.
+            If tuple (e.g., (0.7, 0.1, 0.2)), splits dataset and returns
+            (train_loader, val_loader, test_loader).
+        shuffle : bool, default=True
+            Whether to shuffle data. Only applies when split_ratio is None.
+            When split_ratio is provided, train data is always shuffled.
+
+        Returns
+        -------
+        tuple
+            If split_ratio is None: returns (dataloader,)
+            If split_ratio is provided: returns (train_loader, val_loader, test_loader)
+
+        Examples
+        --------
+        # Case 1: Data already split, just create loader
+        >>> train_gen = SequenceDataGenerator(train_data['seq_tokens'], ...)
+        >>> train_loader = train_gen.generate_dataloader(batch_size=32)[0]
+
+        # Case 2: Auto-split data into train/val/test
+        >>> all_gen = SequenceDataGenerator(all_data['seq_tokens'], ...)
+        >>> train_loader, val_loader, test_loader = all_gen.generate_dataloader(
+        ...     batch_size=32, split_ratio=(0.7, 0.1, 0.2))
         """
         if split_ratio is None:
-
+            # No split - data is already divided, just create a single DataLoader
+            dataloader = DataLoader(self.dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
+            return (dataloader,)

-        #
+        # Split data into train/val/test
         assert abs(sum(split_ratio) - 1.0) < 1e-6, "split_ratio must sum to 1.0"

-        # Compute split sizes
         total_size = len(self.dataset)
         train_size = int(total_size * split_ratio[0])
         val_size = int(total_size * split_ratio[1])
         test_size = total_size - train_size - val_size

-        # Split the dataset
         train_dataset, val_dataset, test_dataset = random_split(self.dataset, [train_size, val_size, test_size])

-        # Create the dataloaders
         train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
-
         val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
-
         test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

         return train_loader, val_loader, test_loader
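
Behaviour change worth noting: ``generate_dataloader`` gains a ``shuffle`` flag, and the ``split_ratio=None`` path now returns a one-element tuple, whereas the old docstring example showed three loaders from a plain ``batch_size`` call. A usage sketch of the documented 0.2.0 behaviour, with made-up arrays in the documented shapes:

    import numpy as np
    from torch_rechub.utils.data import SequenceDataGenerator

    seq_tokens = np.random.randint(0, 1000, (1000, 256))
    seq_positions = np.tile(np.arange(256), (1000, 1))
    seq_time_diffs = np.random.randint(0, 86400, (1000, 256))
    targets = np.random.randint(0, 1000, (1000,))

    gen = SequenceDataGenerator(seq_tokens, seq_positions, targets, seq_time_diffs)

    # No split: data was divided beforehand; note the one-element tuple.
    (loader,) = gen.generate_dataloader(batch_size=32, shuffle=False)

    # Auto split: three loaders, train always shuffled.
    train_loader, val_loader, test_loader = gen.generate_dataloader(
        batch_size=32, split_ratio=(0.7, 0.1, 0.2))
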
torch_rechub/utils/hstu_utils.py
CHANGED
@@ -6,25 +6,27 @@ import torch.nn as nn


 class RelPosBias(nn.Module):
-    """Relative position bias
-
-
-
-
-
-
-
-
-
-
-    Shape
-
-
-
-
-
-
-
+    """Relative position bias for attention.
+
+    Parameters
+    ----------
+    n_heads : int
+        Number of attention heads.
+    max_seq_len : int
+        Maximum supported sequence length.
+    num_buckets : int, default=32
+        Number of relative position buckets.
+
+    Shape
+    -----
+    Output: ``(1, n_heads, seq_len, seq_len)``
+
+    Examples
+    --------
+    >>> rel_pos_bias = RelPosBias(n_heads=8, max_seq_len=256)
+    >>> bias = rel_pos_bias(256)
+    >>> bias.shape
+    torch.Size([1, 8, 256, 256])
     """

     def __init__(self, n_heads, max_seq_len, num_buckets=32):
@@ -87,22 +89,20 @@ class RelPosBias(nn.Module):


 class VocabMask(nn.Module):
-    """Vocabulary mask
-
-
-
-
-
-
-
-
-
-
-
-
-
-    >>> logits = torch.randn(32, 1000)
-    >>> masked_logits = mask.apply_mask(logits)
+    """Vocabulary mask to block invalid items at inference.
+
+    Parameters
+    ----------
+    vocab_size : int
+        Vocabulary size.
+    invalid_items : list, optional
+        IDs to mask out.
+
+    Examples
+    --------
+    >>> mask = VocabMask(vocab_size=1000, invalid_items=[0, 1, 2])
+    >>> logits = torch.randn(32, 1000)
+    >>> masked_logits = mask.apply_mask(logits)
     """

     def __init__(self, vocab_size, invalid_items=None):
@@ -123,13 +123,17 @@ class VocabMask(nn.Module):
             self.mask[item_id] = False

     def apply_mask(self, logits):
-        """
-
-
-
-
-
-
+        """Apply mask to logits.
+
+        Parameters
+        ----------
+        logits : Tensor
+            Model logits, shape ``(..., vocab_size)``.
+
+        Returns
+        -------
+        Tensor
+            Masked logits.
         """
         # Set the logits of invalid items to a very small value
         masked_logits = logits.clone()
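
``apply_mask`` itself is only documented here, not changed; a minimal re-creation of the documented behaviour (a sketch, not the package code) shows the idea of pushing blocked vocabulary entries low enough that softmax or top-k never selects them:

    import torch

    def apply_vocab_mask(logits, valid_mask):
        # Clone, then send blocked vocabulary ids to the lowest representable value.
        masked = logits.clone()
        masked[..., ~valid_mask] = torch.finfo(logits.dtype).min
        return masked

    logits = torch.randn(2, 10)
    valid = torch.ones(10, dtype=torch.bool)
    valid[[0, 1]] = False  # e.g. block PAD / UNK ids
    print(apply_vocab_mask(logits, valid).topk(3, dim=-1).indices)  # never contains 0 or 1
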
@@ -139,26 +143,25 @@ class VocabMask(nn.Module):


 class VocabMapper(object):
-    """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    >>> decoded_ids = mapper.decode(token_ids)
+    """Identity mapper between ``item_id`` and ``token_id``.
+
+    Useful for sequence generation where items are treated as tokens.
+
+    Parameters
+    ----------
+    vocab_size : int
+        Vocabulary size.
+    pad_id : int, default=0
+        PAD token id.
+    unk_id : int, default=1
+        Unknown token id.
+
+    Examples
+    --------
+    >>> mapper = VocabMapper(vocab_size=1000)
+    >>> item_ids = np.array([10, 20, 30])
+    >>> token_ids = mapper.encode(item_ids)
+    >>> decoded_ids = mapper.decode(token_ids)
     """

     def __init__(self, vocab_size, pad_id=0, unk_id=1):
@@ -172,26 +175,34 @@ class VocabMapper(object):
         self.token2item = np.arange(vocab_size)

     def encode(self, item_ids):
-        """
-
-
-
-
-
-
+        """Convert item_ids to token_ids.
+
+        Parameters
+        ----------
+        item_ids : np.ndarray
+            Item ids.
+
+        Returns
+        -------
+        np.ndarray
+            Token ids.
         """
         # Handle item_ids outside the valid range
         token_ids = np.where((item_ids >= 0) & (item_ids < self.vocab_size), item_ids, self.unk_id)
         return token_ids

     def decode(self, token_ids):
-        """
-
-
-
-
-
-
+        """Convert token_ids back to item_ids.
+
+        Parameters
+        ----------
+        token_ids : np.ndarray
+            Token ids.
+
+        Returns
+        -------
+        np.ndarray
+            Item ids.
         """
         # Handle token_ids outside the valid range
         item_ids = np.where((token_ids >= 0) & (token_ids < self.vocab_size), token_ids, self.unk_id)