nextrec-0.1.1-py3-none-any.whl → nextrec-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +4 -4
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -9
- nextrec/basic/callback.py +1 -0
- nextrec/basic/dataloader.py +168 -127
- nextrec/basic/features.py +24 -27
- nextrec/basic/layers.py +328 -159
- nextrec/basic/loggers.py +50 -37
- nextrec/basic/metrics.py +255 -147
- nextrec/basic/model.py +817 -462
- nextrec/data/__init__.py +5 -5
- nextrec/data/data_utils.py +16 -12
- nextrec/data/preprocessor.py +276 -252
- nextrec/loss/__init__.py +12 -12
- nextrec/loss/loss_utils.py +30 -22
- nextrec/loss/match_losses.py +116 -83
- nextrec/models/match/__init__.py +5 -5
- nextrec/models/match/dssm.py +70 -61
- nextrec/models/match/dssm_v2.py +61 -51
- nextrec/models/match/mind.py +89 -71
- nextrec/models/match/sdm.py +93 -81
- nextrec/models/match/youtube_dnn.py +62 -53
- nextrec/models/multi_task/esmm.py +49 -43
- nextrec/models/multi_task/mmoe.py +65 -56
- nextrec/models/multi_task/ple.py +92 -65
- nextrec/models/multi_task/share_bottom.py +48 -42
- nextrec/models/ranking/__init__.py +7 -7
- nextrec/models/ranking/afm.py +39 -30
- nextrec/models/ranking/autoint.py +70 -57
- nextrec/models/ranking/dcn.py +43 -35
- nextrec/models/ranking/deepfm.py +34 -28
- nextrec/models/ranking/dien.py +115 -79
- nextrec/models/ranking/din.py +84 -60
- nextrec/models/ranking/fibinet.py +51 -35
- nextrec/models/ranking/fm.py +28 -26
- nextrec/models/ranking/masknet.py +31 -31
- nextrec/models/ranking/pnn.py +30 -31
- nextrec/models/ranking/widedeep.py +36 -31
- nextrec/models/ranking/xdeepfm.py +46 -39
- nextrec/utils/__init__.py +9 -9
- nextrec/utils/embedding.py +1 -1
- nextrec/utils/initializer.py +23 -15
- nextrec/utils/optimizer.py +14 -10
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/METADATA +6 -40
- nextrec-0.1.2.dist-info/RECORD +51 -0
- nextrec-0.1.1.dist-info/RECORD +0 -51
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/WHEEL +0 -0
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/licenses/LICENSE +0 -0
nextrec/basic/layers.py
CHANGED
@@ -50,7 +50,14 @@ __all__ = [
 
 
 class PredictionLayer(nn.Module):
-    _CLASSIFICATION_TASKS = {
+    _CLASSIFICATION_TASKS = {
+        "classification",
+        "binary",
+        "ctr",
+        "ranking",
+        "match",
+        "matching",
+    }
     _REGRESSION_TASKS = {"regression", "continuous"}
     _MULTICLASS_TASKS = {"multiclass", "softmax"}
 
@@ -213,7 +220,9 @@ class EmbeddingLayer(nn.Module):
             elif feature.combiner == "concat":
                 pooling_layer = ConcatPooling()
             else:
-                raise ValueError(
+                raise ValueError(
+                    f"Unknown combiner for {feature.name}: {feature.combiner}"
+                )
 
             feature_mask = InputMask()(x, feature, seq_input)
             sparse_embeds.append(pooling_layer(seq_emb, feature_mask).unsqueeze(1))
@@ -245,7 +254,9 @@ class EmbeddingLayer(nn.Module):
 
         if target_dim is not None:
             aligned_dense = [
-                emb.unsqueeze(1)
+                emb.unsqueeze(1)
+                for emb in dense_embeds
+                if emb.shape[-1] == target_dim
             ]
             output_embeddings.extend(aligned_dense)
 
@@ -257,7 +268,9 @@ class EmbeddingLayer(nn.Module):
 
         return torch.cat(output_embeddings, dim=1)
 
-    def _project_dense(
+    def _project_dense(
+        self, feature: DenseFeature, x: dict[str, torch.Tensor]
+    ) -> torch.Tensor:
         if feature.name not in x:
             raise KeyError(f"Dense feature '{feature.name}' is missing from input.")
 
@@ -280,6 +293,7 @@ class EmbeddingLayer(nn.Module):
     def _compute_output_dim(self):
         return
 
+
 class InputMask(nn.Module):
     """Utility module to build sequence masks for pooling layers."""
 
@@ -289,9 +303,9 @@ class InputMask(nn.Module):
     def forward(self, x, fea, seq_tensor=None):
         values = seq_tensor if seq_tensor is not None else x[fea.name]
         if fea.padding_idx is not None:
-            mask =
+            mask = values.long() != fea.padding_idx
         else:
-            mask =
+            mask = values.long() != 0
         if mask.dim() == 1:
             mask = mask.unsqueeze(-1)
         return mask.unsqueeze(1).float()
@@ -319,7 +333,7 @@ class ConcatPooling(nn.Module):
         super().__init__()
 
     def forward(self, x, mask=None):
-        return x.flatten(start_dim=1, end_dim=2)
+        return x.flatten(start_dim=1, end_dim=2)
 
 
 class AveragePooling(nn.Module):
@@ -353,7 +367,9 @@ class SumPooling(nn.Module):
 class MLP(nn.Module):
     """Stacked fully connected layers used in the deep component."""
 
-    def __init__(
+    def __init__(
+        self, input_dim, output_layer=True, dims=None, dropout=0, activation="relu"
+    ):
         super().__init__()
         if dims is None:
             dims = []
@@ -380,7 +396,7 @@ class FM(nn.Module):
         self.reduce_sum = reduce_sum
 
     def forward(self, x):
-        square_of_sum = torch.sum(x, dim=1)**2
+        square_of_sum = torch.sum(x, dim=1) ** 2
         sum_of_square = torch.sum(x**2, dim=1)
         ix = square_of_sum - sum_of_square
         if self.reduce_sum:
@@ -399,7 +415,16 @@ class CIN(nn.Module):
         prev_dim, fc_input_dim = input_dim, 0
         for i in range(self.num_layers):
             cross_layer_size = cin_size[i]
-            self.conv_layers.append(
+            self.conv_layers.append(
+                torch.nn.Conv1d(
+                    input_dim * prev_dim,
+                    cross_layer_size,
+                    1,
+                    stride=1,
+                    dilation=1,
+                    bias=True,
+                )
+            )
             if self.split_half and i != self.num_layers - 1:
                 cross_layer_size //= 2
             prev_dim = cross_layer_size
@@ -421,6 +446,7 @@ class CIN(nn.Module):
             xs.append(x)
         return self.fc(torch.sum(torch.cat(xs, dim=1), 2))
 
+
 class CrossLayer(nn.Module):
     """Single cross layer used in DCN (Wang et al., 2017)."""
 
@@ -440,8 +466,12 @@ class CrossNetwork(nn.Module):
     def __init__(self, input_dim, num_layers):
         super().__init__()
         self.num_layers = num_layers
-        self.w = torch.nn.ModuleList(
-
+        self.w = torch.nn.ModuleList(
+            [torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)]
+        )
+        self.b = torch.nn.ParameterList(
+            [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
+        )
 
     def forward(self, x):
         """
@@ -453,21 +483,30 @@ class CrossNetwork(nn.Module):
             x = x0 * xw + self.b[i] + x
         return x
 
+
 class CrossNetV2(nn.Module):
     """Vector-wise cross network proposed in DCN V2 (Wang et al., 2021)."""
+
     def __init__(self, input_dim, num_layers):
         super().__init__()
         self.num_layers = num_layers
-        self.w = torch.nn.ModuleList(
-
-
+        self.w = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(input_dim, input_dim, bias=False)
+                for _ in range(num_layers)
+            ]
+        )
+        self.b = torch.nn.ParameterList(
+            [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
+        )
 
     def forward(self, x):
         x0 = x
         for i in range(self.num_layers):
-            x =x0*self.w[i](x) + self.b[i] + x
+            x = x0 * self.w[i](x) + self.b[i] + x
         return x
 
+
 class CrossNetMix(nn.Module):
     """Mixture of low-rank cross experts from DCN V2 (Wang et al., 2021)."""
 
@@ -477,18 +516,46 @@ class CrossNetMix(nn.Module):
         self.num_experts = num_experts
 
         # U: (input_dim, low_rank)
-        self.u_list = torch.nn.ParameterList(
-
+        self.u_list = torch.nn.ParameterList(
+            [
+                nn.Parameter(
+                    nn.init.xavier_normal_(
+                        torch.empty(num_experts, input_dim, low_rank)
+                    )
+                )
+                for i in range(self.num_layers)
+            ]
+        )
         # V: (input_dim, low_rank)
-        self.v_list = torch.nn.ParameterList(
-
+        self.v_list = torch.nn.ParameterList(
+            [
+                nn.Parameter(
+                    nn.init.xavier_normal_(
+                        torch.empty(num_experts, input_dim, low_rank)
+                    )
+                )
+                for i in range(self.num_layers)
+            ]
+        )
         # C: (low_rank, low_rank)
-        self.c_list = torch.nn.ParameterList(
-
-
-
-
-
+        self.c_list = torch.nn.ParameterList(
+            [
+                nn.Parameter(
+                    nn.init.xavier_normal_(torch.empty(num_experts, low_rank, low_rank))
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+        self.gating = nn.ModuleList(
+            [nn.Linear(input_dim, 1, bias=False) for i in range(self.num_experts)]
+        )
+
+        self.bias = torch.nn.ParameterList(
+            [
+                nn.Parameter(nn.init.zeros_(torch.empty(input_dim, 1)))
+                for i in range(self.num_layers)
+            ]
+        )
 
     def forward(self, x):
         x_0 = x.unsqueeze(2)  # (bs, in_features, 1)
@@ -503,7 +570,9 @@ class CrossNetMix(nn.Module):
 
                 # (2) E(x_l)
                 # project the input x_l to $\mathbb{R}^{r}$
-                v_x = torch.matmul(
+                v_x = torch.matmul(
+                    self.v_list[i][expert_id].t(), x_l
+                )  # (bs, low_rank, 1)
 
                 # nonlinear activation in low rank space
                 v_x = torch.tanh(v_x)
@@ -511,7 +580,9 @@ class CrossNetMix(nn.Module):
                 v_x = torch.tanh(v_x)
 
                 # project back to $\mathbb{R}^{d}$
-                uv_x = torch.matmul(
+                uv_x = torch.matmul(
+                    self.u_list[i][expert_id], v_x
+                )  # (bs, in_features, 1)
 
                 dot_ = uv_x + self.bias[i]
                 dot_ = x_0 * dot_  # Hadamard-product
@@ -519,53 +590,78 @@ class CrossNetMix(nn.Module):
                 output_of_experts.append(dot_.squeeze(2))
 
             # (3) mixture of low-rank experts
-            output_of_experts = torch.stack(
-
+            output_of_experts = torch.stack(
+                output_of_experts, 2
+            )  # (bs, in_features, num_experts)
+            gating_score_experts = torch.stack(
+                gating_score_experts, 1
+            )  # (bs, num_experts, 1)
             moe_out = torch.matmul(output_of_experts, gating_score_experts.softmax(1))
             x_l = moe_out + x_l  # (bs, in_features, 1)
 
         x_l = x_l.squeeze()  # (bs, in_features)
         return x_l
 
+
 class SENETLayer(nn.Module):
     """Squeeze-and-Excitation block adopted by FiBiNET (Huang et al., 2019)."""
 
     def __init__(self, num_fields, reduction_ratio=3):
         super(SENETLayer, self).__init__()
-        reduced_size = max(1, int(num_fields/ reduction_ratio))
-        self.mlp = nn.Sequential(
-
-
-
+        reduced_size = max(1, int(num_fields / reduction_ratio))
+        self.mlp = nn.Sequential(
+            nn.Linear(num_fields, reduced_size, bias=False),
+            nn.ReLU(),
+            nn.Linear(reduced_size, num_fields, bias=False),
+            nn.ReLU(),
+        )
+
     def forward(self, x):
         z = torch.mean(x, dim=-1, out=None)
         a = self.mlp(z)
-        v = x*a.unsqueeze(-1)
+        v = x * a.unsqueeze(-1)
         return v
 
+
 class BiLinearInteractionLayer(nn.Module):
     """Bilinear feature interaction from FiBiNET (Huang et al., 2019)."""
 
-    def __init__(self, input_dim, num_fields, bilinear_type
+    def __init__(self, input_dim, num_fields, bilinear_type="field_interaction"):
         super(BiLinearInteractionLayer, self).__init__()
         self.bilinear_type = bilinear_type
         if self.bilinear_type == "field_all":
             self.bilinear_layer = nn.Linear(input_dim, input_dim, bias=False)
         elif self.bilinear_type == "field_each":
-            self.bilinear_layer = nn.ModuleList(
+            self.bilinear_layer = nn.ModuleList(
+                [nn.Linear(input_dim, input_dim, bias=False) for i in range(num_fields)]
+            )
         elif self.bilinear_type == "field_interaction":
-            self.bilinear_layer = nn.ModuleList(
+            self.bilinear_layer = nn.ModuleList(
+                [
+                    nn.Linear(input_dim, input_dim, bias=False)
+                    for i, j in combinations(range(num_fields), 2)
+                ]
+            )
         else:
             raise NotImplementedError()
 
     def forward(self, x):
         feature_emb = torch.split(x, 1, dim=1)
         if self.bilinear_type == "field_all":
-            bilinear_list = [
+            bilinear_list = [
+                self.bilinear_layer(v_i) * v_j
+                for v_i, v_j in combinations(feature_emb, 2)
+            ]
         elif self.bilinear_type == "field_each":
-            bilinear_list = [
+            bilinear_list = [
+                self.bilinear_layer[i](feature_emb[i]) * feature_emb[j]
+                for i, j in combinations(range(len(feature_emb)), 2)
+            ]
        elif self.bilinear_type == "field_interaction":
-            bilinear_list = [
+            bilinear_list = [
+                self.bilinear_layer[i](v[0]) * v[1]
+                for i, v in enumerate(combinations(feature_emb, 2))
+            ]
         return torch.cat(bilinear_list, dim=1)
 
 
@@ -578,17 +674,23 @@ class MultiInterestSA(nn.Module):
         self.interest_num = interest_num
         if hidden_dim == None:
             self.hidden_dim = self.embedding_dim * 4
-        self.W1 = torch.nn.Parameter(
-
-
+        self.W1 = torch.nn.Parameter(
+            torch.rand(self.embedding_dim, self.hidden_dim), requires_grad=True
+        )
+        self.W2 = torch.nn.Parameter(
+            torch.rand(self.hidden_dim, self.interest_num), requires_grad=True
+        )
+        self.W3 = torch.nn.Parameter(
+            torch.rand(self.embedding_dim, self.embedding_dim), requires_grad=True
+        )
 
     def forward(self, seq_emb, mask=None):
-        H = torch.einsum(
+        H = torch.einsum("bse, ed -> bsd", seq_emb, self.W1).tanh()
         if mask != None:
-            A = torch.einsum(
+            A = torch.einsum("bsd, dk -> bsk", H, self.W2) + -1.0e9 * (1 - mask.float())
            A = F.softmax(A, dim=1)
         else:
-            A = F.softmax(torch.einsum(
+            A = F.softmax(torch.einsum("bsd, dk -> bsk", H, self.W2), dim=1)
         A = A.permute(0, 2, 1)
         multi_interest_emb = torch.matmul(A, seq_emb)
         return multi_interest_emb
@@ -597,7 +699,15 @@ class MultiInterestSA(nn.Module):
 class CapsuleNetwork(nn.Module):
     """Dynamic routing capsule network used in MIND (Li et al., 2019)."""
 
-    def __init__(
+    def __init__(
+        self,
+        embedding_dim,
+        seq_len,
+        bilinear_type=2,
+        interest_num=4,
+        routing_times=3,
+        relu_layer=False,
+    ):
         super(CapsuleNetwork, self).__init__()
         self.embedding_dim = embedding_dim  # h
         self.seq_len = seq_len  # s
@@ -607,13 +717,24 @@ class CapsuleNetwork(nn.Module):
 
         self.relu_layer = relu_layer
         self.stop_grad = True
-        self.relu = nn.Sequential(
+        self.relu = nn.Sequential(
+            nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), nn.ReLU()
+        )
         if self.bilinear_type == 0:  # MIND
             self.linear = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False)
         elif self.bilinear_type == 1:
-            self.linear = nn.Linear(
+            self.linear = nn.Linear(
+                self.embedding_dim, self.embedding_dim * self.interest_num, bias=False
+            )
         else:
-            self.w = nn.Parameter(
+            self.w = nn.Parameter(
+                torch.Tensor(
+                    1,
+                    self.seq_len,
+                    self.interest_num * self.embedding_dim,
+                    self.embedding_dim,
+                )
+            )
             nn.init.xavier_uniform_(self.w)
 
     def forward(self, item_eb, mask):
@@ -624,11 +745,15 @@ class CapsuleNetwork(nn.Module):
             item_eb_hat = self.linear(item_eb)
         else:
             u = torch.unsqueeze(item_eb, dim=2)
-            item_eb_hat = torch.sum(self.w[:, :self.seq_len, :, :] * u, dim=3)
+            item_eb_hat = torch.sum(self.w[:, : self.seq_len, :, :] * u, dim=3)
 
-        item_eb_hat = torch.reshape(
+        item_eb_hat = torch.reshape(
+            item_eb_hat, (-1, self.seq_len, self.interest_num, self.embedding_dim)
+        )
         item_eb_hat = torch.transpose(item_eb_hat, 1, 2).contiguous()
-        item_eb_hat = torch.reshape(
+        item_eb_hat = torch.reshape(
+            item_eb_hat, (-1, self.interest_num, self.seq_len, self.embedding_dim)
+        )
 
         if self.stop_grad:
             item_eb_hat_iter = item_eb_hat.detach()
@@ -636,34 +761,47 @@ class CapsuleNetwork(nn.Module):
             item_eb_hat_iter = item_eb_hat
 
         if self.bilinear_type > 0:
-            capsule_weight = torch.zeros(
-
-
-
-
+            capsule_weight = torch.zeros(
+                item_eb_hat.shape[0],
+                self.interest_num,
+                self.seq_len,
+                device=item_eb.device,
+                requires_grad=False,
+            )
         else:
-            capsule_weight = torch.randn(
-
-
-
-
+            capsule_weight = torch.randn(
+                item_eb_hat.shape[0],
+                self.interest_num,
+                self.seq_len,
+                device=item_eb.device,
+                requires_grad=False,
+            )
 
         for i in range(self.routing_times):  # 动态路由传播3次
             atten_mask = torch.unsqueeze(mask, 1).repeat(1, self.interest_num, 1)
             paddings = torch.zeros_like(atten_mask, dtype=torch.float)
 
             capsule_softmax_weight = F.softmax(capsule_weight, dim=-1)
-            capsule_softmax_weight = torch.where(
+            capsule_softmax_weight = torch.where(
+                torch.eq(atten_mask, 0), paddings, capsule_softmax_weight
+            )
             capsule_softmax_weight = torch.unsqueeze(capsule_softmax_weight, 2)
 
             if i < 2:
-                interest_capsule = torch.matmul(
+                interest_capsule = torch.matmul(
+                    capsule_softmax_weight, item_eb_hat_iter
+                )
                 cap_norm = torch.sum(torch.square(interest_capsule), -1, True)
                 scalar_factor = cap_norm / (1 + cap_norm) / torch.sqrt(cap_norm + 1e-9)
                 interest_capsule = scalar_factor * interest_capsule
 
-                delta_weight = torch.matmul(
-
+                delta_weight = torch.matmul(
+                    item_eb_hat_iter,
+                    torch.transpose(interest_capsule, 2, 3).contiguous(),
+                )
+                delta_weight = torch.reshape(
+                    delta_weight, (-1, self.interest_num, self.seq_len)
+                )
                 capsule_weight = capsule_weight + delta_weight
             else:
                 interest_capsule = torch.matmul(capsule_softmax_weight, item_eb_hat)
@@ -671,7 +809,9 @@ class CapsuleNetwork(nn.Module):
                 scalar_factor = cap_norm / (1 + cap_norm) / torch.sqrt(cap_norm + 1e-9)
                 interest_capsule = scalar_factor * interest_capsule
 
-        interest_capsule = torch.reshape(
+        interest_capsule = torch.reshape(
+            interest_capsule, (-1, self.interest_num, self.embedding_dim)
+        )
 
         if self.relu_layer:
             interest_capsule = self.relu(interest_capsule)
@@ -683,18 +823,18 @@ class FFM(nn.Module):
     """Field-aware Factorization Machine (Juan et al., 2016)."""
 
     def __init__(self, num_fields, reduce_sum=True):
-        super().__init__()
+        super().__init__()
         self.num_fields = num_fields
         self.reduce_sum = reduce_sum
 
     def forward(self, x):
         # compute (non-redundant) second order field-aware feature crossings
         crossed_embeddings = []
-        for i in range(self.num_fields-1):
-            for j in range(i+1, self.num_fields):
-                crossed_embeddings.append(x[:, i, j, :] *
+        for i in range(self.num_fields - 1):
+            for j in range(i + 1, self.num_fields):
+                crossed_embeddings.append(x[:, i, j, :] * x[:, j, i, :])
         crossed_embeddings = torch.stack(crossed_embeddings, dim=1)
-
+
         # if reduce_sum is true, the crossing operation is effectively inner product, other wise Hadamard-product
         if self.reduce_sum:
             crossed_embeddings = torch.sum(crossed_embeddings, dim=-1, keepdim=True)
@@ -705,49 +845,57 @@ class CEN(nn.Module):
     """Field-attentive interaction network from FAT-DeepFFM (Wang et al., 2020)."""
 
     def __init__(self, embed_dim, num_field_crosses, reduction_ratio):
-        super().__init__()
-
+        super().__init__()
+
         # convolution weight (Eq.7 FAT-DeepFFM)
-        self.u = torch.nn.Parameter(
+        self.u = torch.nn.Parameter(
+            torch.rand(num_field_crosses, embed_dim), requires_grad=True
+        )
 
         # two FC layers that computes the field attention
-        self.mlp_att = MLP(
-
-
-
+        self.mlp_att = MLP(
+            num_field_crosses,
+            dims=[num_field_crosses // reduction_ratio, num_field_crosses],
+            output_layer=False,
+            activation="relu",
+        )
+
+    def forward(self, em):
         # compute descriptor vector (Eq.7 FAT-DeepFFM), output shape [batch_size, num_field_crosses]
         d = F.relu((self.u.squeeze(0) * em).sum(-1))
-
-        # compute field attention (Eq.9), output shape [batch_size, num_field_crosses]
-        s = self.mlp_att(d)
+
+        # compute field attention (Eq.9), output shape [batch_size, num_field_crosses]
+        s = self.mlp_att(d)
 
         # rescale original embedding with field attention (Eq.10), output shape [batch_size, num_field_crosses, embed_dim]
-        aem = s.unsqueeze(-1) * em
+        aem = s.unsqueeze(-1) * em
         return aem.flatten(start_dim=1)
 
 
 class MultiHeadSelfAttention(nn.Module):
     """Multi-head self-attention layer from AutoInt (Song et al., 2019)."""
-
+
     def __init__(self, embedding_dim, num_heads=2, dropout=0.0, use_residual=True):
         super().__init__()
         if embedding_dim % num_heads != 0:
-            raise ValueError(
-
+            raise ValueError(
+                f"embedding_dim ({embedding_dim}) must be divisible by num_heads ({num_heads})"
+            )
+
         self.embedding_dim = embedding_dim
         self.num_heads = num_heads
         self.head_dim = embedding_dim // num_heads
         self.use_residual = use_residual
-
+
         self.W_Q = nn.Linear(embedding_dim, embedding_dim, bias=False)
         self.W_K = nn.Linear(embedding_dim, embedding_dim, bias=False)
         self.W_V = nn.Linear(embedding_dim, embedding_dim, bias=False)
-
+
         if self.use_residual:
             self.W_Res = nn.Linear(embedding_dim, embedding_dim, bias=False)
-
+
         self.dropout = nn.Dropout(dropout)
-
+
     def forward(self, x):
         """
         Args:
@@ -756,37 +904,47 @@ class MultiHeadSelfAttention(nn.Module):
            output: [batch_size, num_fields, embedding_dim]
         """
         batch_size, num_fields, _ = x.shape
-
+
         # Linear projections
         Q = self.W_Q(x)  # [batch_size, num_fields, embedding_dim]
         K = self.W_K(x)
         V = self.W_V(x)
-
+
         # Split into multiple heads: [batch_size, num_heads, num_fields, head_dim]
-        Q = Q.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
-
-
-
+        Q = Q.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
+            1, 2
+        )
+        K = K.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
+            1, 2
+        )
+        V = V.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
+            1, 2
+        )
+
         # Attention scores
-        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim
+        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim**0.5)
         attention_weights = F.softmax(scores, dim=-1)
         attention_weights = self.dropout(attention_weights)
-
+
         # Apply attention to values
-        attention_output = torch.matmul(
-
+        attention_output = torch.matmul(
+            attention_weights, V
+        )  # [batch_size, num_heads, num_fields, head_dim]
+
         # Concatenate heads
         attention_output = attention_output.transpose(1, 2).contiguous()
-        attention_output = attention_output.view(
-
+        attention_output = attention_output.view(
+            batch_size, num_fields, self.embedding_dim
+        )
+
         # Residual connection
         if self.use_residual:
             output = attention_output + self.W_Res(x)
         else:
            output = attention_output
-
+
         output = F.relu(output)
-
+
         return output
 
 
@@ -795,25 +953,31 @@ class AttentionPoolingLayer(nn.Module):
     Attention pooling layer for DIN/DIEN
     Computes attention weights between query (candidate item) and keys (user behavior sequence)
     """
-
-    def __init__(
+
+    def __init__(
+        self,
+        embedding_dim,
+        hidden_units=[80, 40],
+        activation="sigmoid",
+        use_softmax=True,
+    ):
         super().__init__()
         self.embedding_dim = embedding_dim
         self.use_softmax = use_softmax
-
+
         # Build attention network
         # Input: [query, key, query-key, query*key] -> 4 * embedding_dim
         input_dim = 4 * embedding_dim
         layers = []
-
+
         for hidden_unit in hidden_units:
             layers.append(nn.Linear(input_dim, hidden_unit))
            layers.append(activation_layer(activation))
            input_dim = hidden_unit
-
+
         layers.append(nn.Linear(input_dim, 1))
         self.attention_net = nn.Sequential(*layers)
-
+
     def forward(self, query, keys, keys_length=None, mask=None):
         """
         Args:
@@ -825,48 +989,52 @@ class AttentionPoolingLayer(nn.Module):
            output: [batch_size, embedding_dim] - attention pooled representation
         """
         batch_size, seq_len, emb_dim = keys.shape
-
+
         # Expand query to match sequence length: [batch_size, seq_len, embedding_dim]
         query_expanded = query.unsqueeze(1).expand(-1, seq_len, -1)
-
+
         # Compute attention features: [query, key, query-key, query*key]
-        attention_input = torch.cat(
-            query_expanded,
-
-
-            query_expanded * keys
-        ], dim=-1)  # [batch_size, seq_len, 4*embedding_dim]
-
+        attention_input = torch.cat(
+            [query_expanded, keys, query_expanded - keys, query_expanded * keys], dim=-1
+        )  # [batch_size, seq_len, 4*embedding_dim]
+
         # Compute attention scores
-        attention_scores = self.attention_net(
-
+        attention_scores = self.attention_net(
+            attention_input
+        )  # [batch_size, seq_len, 1]
+
         # Apply mask if provided
         if mask is not None:
             attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
-
+
         # Apply softmax to get attention weights
         if self.use_softmax:
-            attention_weights = F.softmax(
+            attention_weights = F.softmax(
+                attention_scores, dim=1
+            )  # [batch_size, seq_len, 1]
         else:
             attention_weights = attention_scores
-
+
         # Weighted sum of keys
-        output = torch.sum(
-
+        output = torch.sum(
+            attention_weights * keys, dim=1
+        )  # [batch_size, embedding_dim]
+
         return output
 
 
 class DynamicGRU(nn.Module):
     """Dynamic GRU unit with auxiliary loss path from DIEN (Zhou et al., 2019)."""
+
     """
     GRU with dynamic routing for DIEN
     """
-
+
     def __init__(self, input_size, hidden_size, bias=True):
         super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
-
+
         # GRU parameters
         self.weight_ih = nn.Parameter(torch.randn(3 * hidden_size, input_size))
         self.weight_hh = nn.Parameter(torch.randn(3 * hidden_size, hidden_size))
@@ -874,16 +1042,16 @@ class DynamicGRU(nn.Module):
             self.bias_ih = nn.Parameter(torch.randn(3 * hidden_size))
             self.bias_hh = nn.Parameter(torch.randn(3 * hidden_size))
         else:
-            self.register_parameter(
-            self.register_parameter(
-
+            self.register_parameter("bias_ih", None)
+            self.register_parameter("bias_hh", None)
+
         self.reset_parameters()
-
+
     def reset_parameters(self):
         std = 1.0 / (self.hidden_size) ** 0.5
         for weight in self.parameters():
            weight.data.uniform_(-std, std)
-
+
     def forward(self, x, att_scores=None):
         """
         Args:
@@ -894,60 +1062,61 @@ class DynamicGRU(nn.Module):
            hidden: [batch_size, hidden_size] - final hidden state
         """
         batch_size, seq_len, _ = x.shape
-
+
         # Initialize hidden state
         h = torch.zeros(batch_size, self.hidden_size, device=x.device)
-
+
         outputs = []
         for t in range(seq_len):
             x_t = x[:, t, :]  # [batch_size, input_size]
-
+
             # GRU computation
            gi = F.linear(x_t, self.weight_ih, self.bias_ih)
            gh = F.linear(h, self.weight_hh, self.bias_hh)
            i_r, i_i, i_n = gi.chunk(3, 1)
            h_r, h_i, h_n = gh.chunk(3, 1)
-
+
            resetgate = torch.sigmoid(i_r + h_r)
            inputgate = torch.sigmoid(i_i + h_i)
            newgate = torch.tanh(i_n + resetgate * h_n)
            h = newgate + inputgate * (h - newgate)
-
+
            outputs.append(h.unsqueeze(1))
-
+
         output = torch.cat(outputs, dim=1)  # [batch_size, seq_len, hidden_size]
-
+
         return output, h
 
 
 class AUGRU(nn.Module):
     """Attention-aware GRU update gate used in DIEN (Zhou et al., 2019)."""
+
     """
     Attention-based GRU for DIEN
     Uses attention scores to weight the update of hidden states
     """
-
+
     def __init__(self, input_size, hidden_size, bias=True):
         super().__init__()
         self.input_size = input_size
         self.hidden_size = hidden_size
-
+
         self.weight_ih = nn.Parameter(torch.randn(3 * hidden_size, input_size))
         self.weight_hh = nn.Parameter(torch.randn(3 * hidden_size, hidden_size))
         if bias:
            self.bias_ih = nn.Parameter(torch.randn(3 * hidden_size))
            self.bias_hh = nn.Parameter(torch.randn(3 * hidden_size))
         else:
-            self.register_parameter(
-            self.register_parameter(
-
+            self.register_parameter("bias_ih", None)
+            self.register_parameter("bias_hh", None)
+
         self.reset_parameters()
-
+
     def reset_parameters(self):
         std = 1.0 / (self.hidden_size) ** 0.5
         for weight in self.parameters():
            weight.data.uniform_(-std, std)
-
+
     def forward(self, x, att_scores):
         """
         Args:
@@ -958,28 +1127,28 @@ class AUGRU(nn.Module):
            hidden: [batch_size, hidden_size] - final hidden state
         """
         batch_size, seq_len, _ = x.shape
-
+
         h = torch.zeros(batch_size, self.hidden_size, device=x.device)
-
+
         outputs = []
         for t in range(seq_len):
            x_t = x[:, t, :]  # [batch_size, input_size]
            att_t = att_scores[:, t, :]  # [batch_size, 1]
-
+
            gi = F.linear(x_t, self.weight_ih, self.bias_ih)
            gh = F.linear(h, self.weight_hh, self.bias_hh)
            i_r, i_i, i_n = gi.chunk(3, 1)
            h_r, h_i, h_n = gh.chunk(3, 1)
-
+
            resetgate = torch.sigmoid(i_r + h_r)
            inputgate = torch.sigmoid(i_i + h_i)
            newgate = torch.tanh(i_n + resetgate * h_n)
-
+
            # Use attention score to control update
            h = (1 - att_t) * h + att_t * newgate
-
+
            outputs.append(h.unsqueeze(1))
-
+
         output = torch.cat(outputs, dim=1)
-
-        return output, h
+
+        return output, h