nextrec 0.3.6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +244 -113
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1373 -443
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +42 -24
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +303 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +106 -40
- nextrec/models/match/dssm.py +82 -69
- nextrec/models/match/dssm_v2.py +72 -58
- nextrec/models/match/mind.py +175 -108
- nextrec/models/match/sdm.py +104 -88
- nextrec/models/match/youtube_dnn.py +73 -60
- nextrec/models/multi_task/esmm.py +53 -39
- nextrec/models/multi_task/mmoe.py +70 -47
- nextrec/models/multi_task/ple.py +107 -50
- nextrec/models/multi_task/poso.py +121 -41
- nextrec/models/multi_task/share_bottom.py +54 -38
- nextrec/models/ranking/afm.py +172 -45
- nextrec/models/ranking/autoint.py +84 -61
- nextrec/models/ranking/dcn.py +59 -42
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +36 -26
- nextrec/models/ranking/dien.py +158 -102
- nextrec/models/ranking/din.py +88 -60
- nextrec/models/ranking/fibinet.py +55 -35
- nextrec/models/ranking/fm.py +32 -26
- nextrec/models/ranking/masknet.py +95 -34
- nextrec/models/ranking/pnn.py +34 -31
- nextrec/models/ranking/widedeep.py +37 -29
- nextrec/models/ranking/xdeepfm.py +63 -41
- nextrec/utils/__init__.py +61 -32
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +52 -12
- nextrec/utils/distributed.py +141 -0
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +531 -0
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.3.6.dist-info/RECORD +0 -64
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/loss/listwise.py
CHANGED

@@ -20,10 +20,14 @@ class SampledSoftmaxLoss(nn.Module):
         super().__init__()
         self.reduction = reduction
 
-    def forward(self, pos_logits: torch.Tensor, neg_logits: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, pos_logits: torch.Tensor, neg_logits: torch.Tensor
+    ) -> torch.Tensor:
         pos_logits = pos_logits.unsqueeze(1)
         all_logits = torch.cat([pos_logits, neg_logits], dim=1)
-        targets = torch.zeros(all_logits.size(0), dtype=torch.long, device=all_logits.device)
+        targets = torch.zeros(
+            all_logits.size(0), dtype=torch.long, device=all_logits.device
+        )
         loss = F.cross_entropy(all_logits, targets, reduction=self.reduction)
         return loss
 
@@ -87,7 +91,11 @@ class ListMLELoss(nn.Module):
     def forward(self, scores: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
         sorted_labels, sorted_indices = torch.sort(labels, descending=True, dim=1)
         batch_size, list_size = scores.shape
-        batch_indices = torch.arange(batch_size, device=scores.device).unsqueeze(1).expand(-1, list_size)
+        batch_indices = (
+            torch.arange(batch_size, device=scores.device)
+            .unsqueeze(1)
+            .expand(-1, list_size)
+        )
         sorted_scores = scores[batch_indices, sorted_indices]
 
         loss = torch.tensor(0.0, device=scores.device)
@@ -139,19 +147,19 @@ class ApproxNDCGLoss(nn.Module):
         device = scores.device
 
         # diff[b, i, j] = (s_j - s_i) / T
-        scores_i = scores.unsqueeze(2)
-        scores_j = scores.unsqueeze(1)
+        scores_i = scores.unsqueeze(2)  # [B, L, 1]
+        scores_j = scores.unsqueeze(1)  # [B, 1, L]
         diff = (scores_j - scores_i) / self.temperature  # [B, L, L]
 
-        P_ji = torch.sigmoid(diff)
+        P_ji = torch.sigmoid(diff)  # [B, L, L]
         eye = torch.eye(list_size, device=device).unsqueeze(0)  # [1, L, L]
         P_ji = P_ji * (1.0 - eye)
 
-        exp_rank = 1.0 + P_ji.sum(dim=-1)
+        exp_rank = 1.0 + P_ji.sum(dim=-1)  # [B, L]
 
         discounts = 1.0 / torch.log2(exp_rank + 1.0)  # [B, L]
 
-        gains = torch.pow(2.0, labels) - 1.0
+        gains = torch.pow(2.0, labels) - 1.0  # [B, L]
         approx_dcg = torch.sum(gains * discounts, dim=1)  # [B]
 
         ideal_dcg = self._ideal_dcg(labels, k)  # [B]
@@ -163,4 +171,4 @@ class ApproxNDCGLoss(nn.Module):
             return loss.mean()
         if self.reduction == "sum":
             return loss.sum()
-        return loss
\ No newline at end of file
+        return loss
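
Note: SampledSoftmaxLoss above reduces sampled softmax to an ordinary cross-entropy by concatenating the positive logit at column 0 and using class 0 as the target for every row. A standalone sketch of that construction (tensor shapes are illustrative, not taken from the package):

import torch
import torch.nn.functional as F

pos_logits = torch.randn(4)      # [B], one positive per example
neg_logits = torch.randn(4, 3)   # [B, num_neg] sampled negatives

# The positive goes to column 0, so the target class is 0 for every row.
all_logits = torch.cat([pos_logits.unsqueeze(1), neg_logits], dim=1)  # [B, 1 + num_neg]
targets = torch.zeros(all_logits.size(0), dtype=torch.long, device=all_logits.device)
loss = F.cross_entropy(all_logits, targets, reduction="mean")
print(loss.item())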
nextrec/loss/loss_utils.py
CHANGED

@@ -6,8 +6,6 @@ Checkpoint: edit on 29/11/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
-from typing import Literal
-
 import torch.nn as nn
 
 from nextrec.loss.listwise import (
@@ -20,19 +18,19 @@ from nextrec.loss.listwise import (
 from nextrec.loss.pairwise import BPRLoss, HingeLoss, TripletLoss
 from nextrec.loss.pointwise import (
     ClassBalancedFocalLoss,
-    CosineContrastiveLoss,
     FocalLoss,
     WeightedBCELoss,
 )
 
 
 VALID_TASK_TYPES = [
-    "binary",
-    "multiclass",
-    "multilabel",
-    "regression",
+    "binary",
+    "multiclass",
+    "multilabel",
+    "regression",
 ]
 
+
 def _build_cb_focal(kw):
     if "class_counts" not in kw:
         raise ValueError("class_balanced_focal requires class_counts")
@@ -81,6 +79,7 @@ def get_loss_fn(loss=None, **kw):
 
     raise ValueError(f"[Loss Error] Unsupported loss: {loss}")
 
+
 def get_loss_kwargs(loss_params: dict | list[dict] | None, index: int = 0) -> dict:
     """
     Parse loss_kwargs for each head.
@@ -95,4 +94,4 @@ def get_loss_kwargs(loss_params: dict | list[dict] | None, index: int = 0) -> dict:
         if index < len(loss_params) and loss_params[index] is not None:
             return loss_params[index]
         return {}
-    return loss_params
\ No newline at end of file
+    return loss_params
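
Note: get_loss_kwargs dispatches per-head loss arguments for multi-head models: a single dict is shared across heads, a list is indexed per head, and missing entries fall back to {}. A minimal re-implementation of that contract for illustration (the None branch is an assumption consistent with the dict | list[dict] | None signature):

def get_loss_kwargs(loss_params, index=0):
    if loss_params is None:
        return {}
    if isinstance(loss_params, list):
        if index < len(loss_params) and loss_params[index] is not None:
            return loss_params[index]
        return {}
    return loss_params  # a single dict is shared by every head

print(get_loss_kwargs({"gamma": 2.0}, index=3))          # {'gamma': 2.0}
print(get_loss_kwargs([{"gamma": 2.0}, None], index=1))  # {}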
nextrec/loss/pairwise.py
CHANGED

@@ -36,6 +36,7 @@ class BPRLoss(nn.Module):
             return loss.sum()
         return loss
 
+
 class HingeLoss(nn.Module):
     """
     Hinge loss for pairwise ranking.
@@ -59,6 +60,7 @@ class HingeLoss(nn.Module):
             return loss.sum()
         return loss
 
+
 class TripletLoss(nn.Module):
     """
     Triplet margin loss with cosine or euclidean distance.
nextrec/loss/pointwise.py
CHANGED

@@ -46,6 +46,7 @@ class WeightedBCELoss(nn.Module):
     If `auto_balance=True` and `pos_weight` is None, the positive weight is
     computed from the batch as (#neg / #pos) for stable imbalance handling.
     """
+
     def __init__(
         self,
         pos_weight: float | torch.Tensor | None = None,
@@ -59,11 +60,14 @@ class WeightedBCELoss(nn.Module):
         self.auto_balance = auto_balance
 
         if pos_weight is not None:
-            self.register_buffer("pos_weight", torch.as_tensor(pos_weight, dtype=torch.float32))
+            self.register_buffer(
+                "pos_weight",
+                torch.as_tensor(pos_weight, dtype=torch.float32),
+            )
         else:
             self.pos_weight = None
 
-    def
+    def resolve_pos_weight(self, labels: torch.Tensor) -> torch.Tensor:
         if self.pos_weight is not None:
             return self.pos_weight.to(device=labels.device)
 
@@ -77,7 +81,7 @@ class WeightedBCELoss(nn.Module):
 
     def forward(self, inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
         labels = labels.float()
-        current_pos_weight = self.
+        current_pos_weight = self.resolve_pos_weight(labels)
         current_pos_weight = current_pos_weight.to(inputs.dtype)
 
         if self.logits:
@@ -120,23 +124,27 @@ class FocalLoss(nn.Module):
         if inputs.dim() > 1 and inputs.size(1) > 1:
             log_probs = F.log_softmax(inputs, dim=1)
             probs = log_probs.exp()
-            targets_one_hot = F.one_hot(targets.long(), num_classes=inputs.size(1)).float()
+            targets_one_hot = F.one_hot(
+                targets.long(), num_classes=inputs.size(1)
+            ).float()
 
-            alpha = self.
+            alpha = self.get_alpha(inputs)
             alpha_factor = targets_one_hot * alpha
             focal_weight = (1.0 - probs) ** self.gamma
             loss = torch.sum(alpha_factor * focal_weight * (-log_probs), dim=1)
         else:
             targets = targets.float()
             if self.logits:
-                ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+                ce_loss = F.binary_cross_entropy_with_logits(
+                    inputs, targets, reduction="none"
+                )
                 probs = torch.sigmoid(inputs)
             else:
                 ce_loss = F.binary_cross_entropy(inputs, targets, reduction="none")
                 probs = torch.clamp(inputs, min=1e-6, max=1 - 1e-6)
 
             p_t = probs * targets + (1 - probs) * (1 - targets)
-            alpha_factor = self.
+            alpha_factor = self.get_binary_alpha(targets, inputs.device)
             focal_weight = (1.0 - p_t) ** self.gamma
             loss = alpha_factor * focal_weight * ce_loss
         if self.reduction == "mean":
@@ -145,27 +153,35 @@ class FocalLoss(nn.Module):
             return loss.sum()
         return loss
 
-    def
+    def get_alpha(self, inputs: torch.Tensor) -> torch.Tensor:
         if self.alpha is None:
             return torch.ones_like(inputs)
         if isinstance(self.alpha, torch.Tensor):
             return self.alpha.to(inputs.device)
-        alpha_tensor = torch.tensor(self.alpha, device=inputs.device, dtype=inputs.dtype)
+        alpha_tensor = torch.tensor(
+            self.alpha, device=inputs.device, dtype=inputs.dtype
+        )
         return alpha_tensor
 
-    def
+    def get_binary_alpha(
+        self, targets: torch.Tensor, device: torch.device
+    ) -> torch.Tensor:
        if self.alpha is None:
             return torch.ones_like(targets)
         if isinstance(self.alpha, (float, int)):
-            return torch.where(targets == 1, self.alpha, 1 - float(self.alpha)).to(device)
+            return torch.where(targets == 1, self.alpha, 1 - float(self.alpha)).to(
+                device
+            )
         alpha_tensor = torch.tensor(self.alpha, device=device, dtype=targets.dtype)
         return torch.where(targets == 1, alpha_tensor, 1 - alpha_tensor)
 
+
 class ClassBalancedFocalLoss(nn.Module):
     """
     Focal loss weighted by effective number of samples per class.
     Reference: "Class-Balanced Loss Based on Effective Number of Samples"
     """
+
     def __init__(
         self,
         class_counts: Sequence[int] | torch.Tensor,
@@ -183,7 +199,9 @@ class ClassBalancedFocalLoss(nn.Module):
         self.register_buffer("class_weights", weights)
 
     def forward(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
-        focal = FocalLoss(gamma=self.gamma, alpha=self.class_weights, reduction="none", logits=True)
+        focal = FocalLoss(
+            gamma=self.gamma, alpha=self.class_weights, reduction="none", logits=True
+        )
         loss = focal(inputs, targets)
         if self.reduction == "mean":
             return loss.mean()
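
Note: the WeightedBCELoss docstring above pins down the auto_balance rule: when pos_weight is None it is derived from the batch as (#neg / #pos). A standalone sketch of that rule (the clamp guard against all-positive batches is an added assumption, not necessarily the package's handling):

import torch
import torch.nn.functional as F

labels = torch.tensor([1., 1., 0., 0., 0., 0., 0., 0.])  # 2 pos, 6 neg
logits = torch.randn(8)

pos_weight = (labels == 0).sum() / (labels == 1).sum().clamp(min=1)  # 6/2 = 3.0
loss = F.binary_cross_entropy_with_logits(logits, labels, pos_weight=pos_weight)
print(pos_weight.item(), loss.item())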
nextrec/models/generative/hstu.py
CHANGED

@@ -1,14 +1,14 @@
 """
 [Info: this version is not released yet, i need to more research on source code and paper]
 Date: create on 01/12/2025
-Checkpoint: edit on 01/12/2025
+Checkpoint: edit on 01/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 Reference:
 [1] Meta AI. Generative Recommenders (HSTU encoder) — https://github.com/meta-recsys/generative-recommenders
 [2] Ma W, Li P, Chen C, et al. Actions speak louder than words: Trillion-parameter sequential transducers for generative recommendations. arXiv:2402.17152.
 
-Hierarchical Sequential Transduction Unit (HSTU) is the core encoder behind
-Meta’s Generative Recommenders. It replaces softmax attention with lightweight
+Hierarchical Sequential Transduction Unit (HSTU) is the core encoder behind
+Meta’s Generative Recommenders. It replaces softmax attention with lightweight
 pointwise activations, enabling extremely deep stacks on long behavior sequences.
 
 In each HSTU layer:
@@ -16,8 +16,8 @@ In each HSTU layer:
 (2) Softmax-free interactions combine QK^T with Relative Attention Bias (RAB) to encode distance
 (3) Aggregated context is modulated by U-gating and mapped back through an output projection
 
-Stacking layers yields an efficient causal encoder for next-item
-generation. With a tied-embedding LM head, HSTU forms
+Stacking layers yields an efficient causal encoder for next-item
+generation. With a tied-embedding LM head, HSTU forms
 a full generative recommendation model.
 
 Key Advantages:
@@ -75,7 +75,16 @@ def _relative_position_bucket(
     is_small = n < max_exact
 
     # when the distance is too far, do log scaling
-    large_val = max_exact + ((torch.log(n.float() / max_exact + 1e-6) / math.log(max_distance / max_exact)) * (num_buckets - max_exact)).long()
+    large_val = (
+        max_exact
+        + (
+            (
+                torch.log(n.float() / max_exact + 1e-6)
+                / math.log(max_distance / max_exact)
+            )
+            * (num_buckets - max_exact)
+        ).long()
+    )
     large_val = torch.clamp(large_val, max=num_buckets - 1)
 
     buckets = torch.where(is_small, n.long(), large_val)
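
Note: this is the T5-style bucketing scheme: distances below max_exact keep their own bucket, larger ones are log-spaced up to max_distance and then saturate. A standalone sketch of the same formula (exposing max_exact as a parameter is a simplification here; the package's full signature is not visible in this hunk):

import math
import torch

def relative_position_bucket(n, num_buckets=32, max_exact=16, max_distance=128):
    # n: non-negative distances. Exact buckets for small distances,
    # log-spaced buckets for large ones, clamped to the last bucket.
    is_small = n < max_exact
    large_val = max_exact + (
        (torch.log(n.float() / max_exact + 1e-6) / math.log(max_distance / max_exact))
        * (num_buckets - max_exact)
    ).long()
    large_val = torch.clamp(large_val, max=num_buckets - 1)
    return torch.where(is_small, n.long(), large_val)

print(relative_position_bucket(torch.tensor([0, 8, 16, 64, 500])))
# tensor([ 0,  8, 16, 26, 31]) -- nearby distances keep exact buckets,
# distant ones compress logarithmically and saturate at the last bucket.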
@@ -104,10 +113,19 @@ class RelativePositionBias(nn.Module):
         # positions: [T]
         ctx = torch.arange(seq_len, device=device)[:, None]
         mem = torch.arange(seq_len, device=device)[None, :]
-        rel_pos = mem - ctx
-        buckets = _relative_position_bucket(rel_pos, num_buckets=self.num_buckets, max_distance=self.max_distance)
-        values = self.embedding(buckets)
-        return values.permute(2, 0, 1).unsqueeze(0)
+        rel_pos = (
+            mem - ctx
+        )  # a matrix to describe all relative positions for each [i,j] pair, shape = [seq_len, seq_len]
+        buckets = _relative_position_bucket(
+            rel_pos,
+            num_buckets=self.num_buckets,
+            max_distance=self.max_distance,
+        )  # map to buckets
+        values = self.embedding(
+            buckets
+        )  # embedding vector for each [i,j] pair, shape = [seq_len, seq_len, embedding_dim=num_heads]
+        return values.permute(2, 0, 1).unsqueeze(0)  # [1, num_heads, seq_len, seq_len]
+
 
 class HSTUPointwiseAttention(nn.Module):
     """
@@ -123,16 +141,18 @@ class HSTUPointwiseAttention(nn.Module):
         d_model: int,
         num_heads: int,
         dropout: float = 0.1,
-        alpha: float | None = None
+        alpha: float | None = None,
     ):
         super().__init__()
         if d_model % num_heads != 0:
-            raise ValueError(f"[HSTUPointwiseAttention Error] d_model({d_model}) % num_heads({num_heads}) != 0")
+            raise ValueError(
+                f"[HSTUPointwiseAttention Error] d_model({d_model}) % num_heads({num_heads}) != 0"
+            )
 
         self.d_model = d_model
         self.num_heads = num_heads
         self.d_head = d_model // num_heads
-        self.alpha = alpha if alpha is not None else (self.d_head ** -0.5)
+        self.alpha = alpha if alpha is not None else (self.d_head**-0.5)
         # project input to 4 * d_model for U, V, Q, K
         self.in_proj = nn.Linear(d_model, 4 * d_model, bias=True)
         # project output back to d_model
@@ -150,9 +170,9 @@ class HSTUPointwiseAttention(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,  # [T, T] with 0 or -inf
         key_padding_mask: Optional[torch.Tensor] = None,  # [B, T], True = pad
-        rab: Optional[torch.Tensor] = None,
+        rab: Optional[torch.Tensor] = None,  # [1, H, T, T] or None
     ) -> torch.Tensor:
         B, T, D = x.shape
 
@@ -185,8 +205,8 @@ class HSTUPointwiseAttention(nn.Module):
         # padding mask: key_padding_mask is usually [B, T], True = pad
         if key_padding_mask is not None:
             # valid: 1 for non-pad, 0 for pad
-            valid = (~key_padding_mask).float()
-            valid = valid.view(B, 1, 1, T)
+            valid = (~key_padding_mask).float()  # [B, T]
+            valid = valid.view(B, 1, 1, T)  # [B, 1, 1, T]
             allowed = allowed * valid
             logits = logits.masked_fill(valid == 0, float("-inf"))
 
@@ -197,7 +217,7 @@ class HSTUPointwiseAttention(nn.Module):
 
         attn = attn / denom  # [B, H, T, T]
         AV = torch.matmul(attn, Vh)  # [B, H, T, d_head]
-        AV = AV.transpose(1, 2).contiguous().view(B, T, D)
+        AV = AV.transpose(1, 2).contiguous().view(B, T, D)  # reshape back to [B, T, D]
         U_flat = Uh.transpose(1, 2).contiguous().view(B, T, D)
         y = self.out_proj(self.dropout(self.norm(AV) * U_flat))  # [B, T, D]
         return y
@@ -218,10 +238,20 @@ class HSTULayer(nn.Module):
         rab_max_distance: int = 128,
     ):
         super().__init__()
-        self.attn = HSTUPointwiseAttention(d_model=d_model, num_heads=num_heads, dropout=dropout)
+        self.attn = HSTUPointwiseAttention(
+            d_model=d_model, num_heads=num_heads, dropout=dropout
+        )
         self.dropout = nn.Dropout(dropout)
         self.use_rab_pos = use_rab_pos
-        self.rel_pos_bias = (RelativePositionBias(num_heads=num_heads, num_buckets=rab_num_buckets, max_distance=rab_max_distance) if use_rab_pos else None)
+        self.rel_pos_bias = (
+            RelativePositionBias(
+                num_heads=num_heads,
+                num_buckets=rab_num_buckets,
+                max_distance=rab_max_distance,
+            )
+            if use_rab_pos
+            else None
+        )
 
     def forward(
         self,
@@ -236,8 +266,10 @@ class HSTULayer(nn.Module):
         device = x.device
         rab = None
         if self.use_rab_pos:
-            rab = self.rel_pos_bias(seq_len=T, device=device)
-        out = self.attn(x=x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, rab=rab)
+            rab = self.rel_pos_bias(seq_len=T, device=device)  # [1, H, T, T]
+        out = self.attn(
+            x=x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, rab=rab
+        )
         return x + self.dropout(out)
 
 
@@ -255,7 +287,7 @@ class HSTU(BaseModel):
         return "HSTU"
 
     @property
-    def
+    def default_task(self) -> str:
         return "multiclass"
 
     def __init__(
@@ -272,9 +304,9 @@ class HSTU(BaseModel):
         use_rab_pos: bool = True,
         rab_num_buckets: int = 32,
         rab_max_distance: int = 128,
-
         tie_embeddings: bool = True,
         target: Optional[list[str] | str] = None,
+        task: str | list[str] | None = None,
         optimizer: str = "adam",
         optimizer_params: Optional[dict] = None,
         scheduler: Optional[str] = None,
@@ -288,17 +320,25 @@ class HSTU(BaseModel):
         **kwargs,
     ):
         if not sequence_features:
-            raise ValueError("[HSTU Error] HSTU requires at least one SequenceFeature (user behavior history).")
+            raise ValueError(
+                "[HSTU Error] HSTU requires at least one SequenceFeature (user behavior history)."
+            )
 
         # demo version: use the first SequenceFeature as the main sequence
         self.history_feature = sequence_features[0]
 
-        hidden_dim = d_model or max(int(getattr(self.history_feature, "embedding_dim", 0) or 0), 32)
+        hidden_dim = d_model or max(
+            int(getattr(self.history_feature, "embedding_dim", 0) or 0), 32
+        )
         # Make hidden_dim divisible by num_heads
         if hidden_dim % num_heads != 0:
             hidden_dim = num_heads * math.ceil(hidden_dim / num_heads)
 
-        self.padding_idx = self.history_feature.padding_idx if self.history_feature.padding_idx is not None else 0
+        self.padding_idx = (
+            self.history_feature.padding_idx
+            if self.history_feature.padding_idx is not None
+            else 0
+        )
         self.vocab_size = self.history_feature.vocab_size
         self.max_seq_len = max_seq_len
 
@@ -307,7 +347,7 @@ class HSTU(BaseModel):
             sparse_features=sparse_features,
             sequence_features=sequence_features,
             target=target,
-            task=self.
+            task=task or self.default_task,
             device=device,
             embedding_l1_reg=embedding_l1_reg,
             dense_l1_reg=dense_l1_reg,
@@ -326,8 +366,19 @@ class HSTU(BaseModel):
         self.input_dropout = nn.Dropout(dropout)
 
         # HSTU layers
-        self.layers = nn.ModuleList(
-
+        self.layers = nn.ModuleList(
+            [
+                HSTULayer(
+                    d_model=hidden_dim,
+                    num_heads=num_heads,
+                    dropout=dropout,
+                    use_rab_pos=use_rab_pos,
+                    rab_num_buckets=rab_num_buckets,
+                    rab_max_distance=rab_max_distance,
+                )
+                for _ in range(num_layers)
+            ]
+        )
 
         self.final_norm = nn.LayerNorm(hidden_dim)
         self.lm_head = nn.Linear(hidden_dim, self.vocab_size, bias=False)
@@ -343,8 +394,17 @@ class HSTU(BaseModel):
         loss_params = loss_params or {}
         loss_params.setdefault("ignore_index", self.ignore_index)
 
-        self.compile(
-
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            scheduler=scheduler,
+            scheduler_params=scheduler_params,
+            loss="crossentropy",
+            loss_params=loss_params,
+        )
+        self.register_regularization_weights(
+            embedding_attr="token_embedding", include_modules=["layers", "lm_head"]
+        )
 
     def _build_causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
         """
|
|
|
353
413
|
"""
|
|
354
414
|
if self.causal_mask.numel() == 0 or self.causal_mask.size(0) < seq_len:
|
|
355
415
|
mask = torch.full((seq_len, seq_len), float("-inf"), device=device)
|
|
356
|
-
mask = torch.triu(mask, diagonal=1)
|
|
416
|
+
mask = torch.triu(mask, diagonal=1)
|
|
357
417
|
self.causal_mask = mask
|
|
358
418
|
return self.causal_mask[:seq_len, :seq_len]
|
|
359
419
|
|
|
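
Note: _build_causal_mask caches a strictly-upper-triangular -inf mask at the largest seq_len seen so far and re-slices it for shorter sequences. What the cached mask contains for T = 4:

import torch

T = 4
mask = torch.triu(torch.full((T, T), float("-inf")), diagonal=1)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])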
@@ -364,27 +424,31 @@ class HSTU(BaseModel):
 
     def forward(self, x: dict[str, torch.Tensor]) -> torch.Tensor:
         seq = x[self.history_feature.name].long()  # [B, T_raw]
-        seq = self._trim_sequence(seq)
+        seq = self._trim_sequence(seq)  # [B, T]
 
         B, T = seq.shape
         device = seq.device
         # position ids: [B, T]
         pos_ids = torch.arange(T, device=device).unsqueeze(0).expand(B, -1)
-        token_emb = self.token_embedding(seq)
-        pos_emb = self.position_embedding(pos_ids)
+        token_emb = self.token_embedding(seq)  # [B, T, D]
+        pos_emb = self.position_embedding(pos_ids)  # [B, T, D]
         hidden_states = self.input_dropout(token_emb + pos_emb)
 
         # padding mask:True = pad
-        padding_mask = seq.eq(self.padding_idx)
+        padding_mask = seq.eq(self.padding_idx)  # [B, T]
         attn_mask = self._build_causal_mask(seq_len=T, device=device)  # [T, T]
 
         for layer in self.layers:
-            hidden_states = layer(x=hidden_states, attn_mask=attn_mask, key_padding_mask=padding_mask)
+            hidden_states = layer(
+                x=hidden_states, attn_mask=attn_mask, key_padding_mask=padding_mask
+            )
         hidden_states = self.final_norm(hidden_states)  # [B, T, D]
 
         valid_lengths = (~padding_mask).sum(dim=1)  # [B]
         last_index = (valid_lengths - 1).clamp(min=0)
-        last_hidden = hidden_states[torch.arange(B, device=device), last_index]
+        last_hidden = hidden_states[
+            torch.arange(B, device=device), last_index
+        ]  # [B, D]
 
         logits = self.lm_head(last_hidden)  # [B, vocab_size]
         return logits
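
Note: the end of forward above gathers each sequence's final non-padding hidden state with advanced indexing. The same pattern in isolation (toy values, pad id 0):

import torch

B, T, D = 2, 4, 3
seq = torch.tensor([[5, 7, 2, 0],      # 3 valid tokens
                    [9, 0, 0, 0]])     # 1 valid token
hidden = torch.randn(B, T, D)

padding_mask = seq.eq(0)                           # [B, T], True = pad
valid_lengths = (~padding_mask).sum(dim=1)         # tensor([3, 1])
last_index = (valid_lengths - 1).clamp(min=0)      # tensor([2, 0])
last_hidden = hidden[torch.arange(B), last_index]  # [B, D]
print(last_hidden.shape)                           # torch.Size([2, 3])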
@@ -394,6 +458,8 @@ class HSTU(BaseModel):
         y_true: [B] or [B, 1], the id of the next item.
         """
         if y_true is None:
-            raise ValueError("[HSTU-compute_loss] Training requires y_true (next item id).")
+            raise ValueError(
+                "[HSTU-compute_loss] Training requires y_true (next item id)."
+            )
         labels = y_true.view(-1).long()
         return self.loss_fn[0](y_pred, labels)