nextrec 0.2.7__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,44 @@
1
1
  """
2
2
  Date: create on 09/11/2025
3
+ Checkpoint: edit on 29/11/2025
3
4
  Author: Yang Zhou, zyaztec@gmail.com
4
- Reference: [1] Ma X, Zhao L, Huang G, et al. Entire space multi-task model: An effective approach for estimating post-click conversion rate[C]//SIGIR. 2018: 1137-1140.
5
+ Reference:
6
+ [1] Ma X, Zhao L, Huang G, et al. Entire space multi-task model: An effective approach
7
+ for estimating post-click conversion rate[C]//SIGIR. 2018: 1137-1140.
8
+ (https://dl.acm.org/doi/10.1145/3209978.3210007)
9
+
10
+ Entire Space Multi-task Model (ESMM) targets CVR estimation by jointly optimizing
11
+ CTR and CTCVR on the full impression space, mitigating sample selection bias and
12
+ conversion sparsity. CTR predicts P(click | impression), CVR predicts P(conversion |
13
+ click), and their product forms CTCVR supervised on impression labels.
14
+
15
+ Workflow:
16
+ (1) Shared embeddings encode all features from impressions
17
+ (2) CTR tower outputs click probability conditioned on impression
18
+ (3) CVR tower outputs conversion probability conditioned on click
19
+ (4) CTCVR = CTR * CVR enables end-to-end training without filtering clicked data
20
+
21
+ Key Advantages:
22
+ - Trains on the entire impression space to remove selection bias
23
+ - Transfers rich click signals to sparse conversion prediction via shared embeddings
24
+ - Stable optimization by decomposing CTCVR into well-defined sub-tasks
25
+ - Simple architecture that can pair with other multi-task variants
26
+
27
+ ESMM(Entire Space Multi-task Model)用于 CVR 预估,通过在曝光全空间联合训练
28
+ CTR 与 CTCVR,缓解样本选择偏差和转化数据稀疏问题。CTR 预测 P(click|impression),
29
+ CVR 预测 P(conversion|click),二者相乘得到 CTCVR 并在曝光标签上直接监督。
30
+
31
+ 流程:
32
+ (1) 共享 embedding 统一处理曝光特征
33
+ (2) CTR 塔输出曝光下的点击概率
34
+ (3) CVR 塔输出点击后的转化概率
35
+ (4) CTR 与 CVR 相乘得到 CTCVR,无需只在点击子集上训练
36
+
37
+ 主要优点:
38
+ - 在曝光空间训练,避免样本选择偏差
39
+ - 通过共享表示将点击信号迁移到稀疏的转化任务
40
+ - 将 CTCVR 分解为子任务,优化稳定
41
+ - 结构简单,可与其它多任务方法组合使用
5
42
  """
6
43
 
7
44
  import torch
@@ -77,37 +114,22 @@ class ESMM(BaseModel):
77
114
 
78
115
  # All features
79
116
  self.all_features = dense_features + sparse_features + sequence_features
80
-
81
117
  # Shared embedding layer
82
118
  self.embedding = EmbeddingLayer(features=self.all_features)
119
+ input_dim = self.embedding.input_dim # input dimension from the embedding layer; cleaner than the manual computation kept below for reference
120
+ # emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
121
+ # dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
122
+ # input_dim = emb_dim_total + dense_input_dim
83
123
 
84
- # Calculate input dimension
85
- emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
86
- dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
87
- input_dim = emb_dim_total + dense_input_dim
88
-
89
124
  # CTR tower
90
125
  self.ctr_tower = MLP(input_dim=input_dim, output_layer=True, **ctr_params)
91
126
 
92
127
  # CVR tower
93
128
  self.cvr_tower = MLP(input_dim=input_dim, output_layer=True, **cvr_params)
94
- self.prediction_layer = PredictionLayer(
95
- task_type=self.task_type,
96
- task_dims=[1, 1]
97
- )
98
-
129
+ self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1, 1])
99
130
  # Register regularization weights
100
- self._register_regularization_weights(
101
- embedding_attr='embedding',
102
- include_modules=['ctr_tower', 'cvr_tower']
103
- )
104
-
105
- self.compile(
106
- optimizer=optimizer,
107
- optimizer_params=optimizer_params,
108
- loss=loss,
109
- loss_params=loss_params,
110
- )
131
+ self._register_regularization_weights(embedding_attr='embedding', include_modules=['ctr_tower', 'cvr_tower'])
132
+ self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)
111
133
 
112
134
  def forward(self, x):
113
135
  # Get all embeddings and flatten
@@ -119,11 +141,8 @@ class ESMM(BaseModel):
119
141
  logits = torch.cat([ctr_logit, cvr_logit], dim=1)
120
142
  preds = self.prediction_layer(logits)
121
143
  ctr, cvr = preds.chunk(2, dim=1)
122
-
123
- # CTCVR prediction: P(click & conversion | impression) = P(click) * P(conversion | click)
124
144
  ctcvr = ctr * cvr # [B, 1]
125
145
 
126
- # Output: [CTR, CTCVR]
127
- # Note: We supervise CTR with click labels and CTCVR with conversion labels
146
+ # Output: [CTR, CTCVR]; we supervise CTR with click labels and CTCVR with conversion labels
128
147
  y = torch.cat([ctr, ctcvr], dim=1) # [B, 2]
129
148
  return y # [B, 2], where y[:, 0] is CTR and y[:, 1] is CTCVR
@@ -1,7 +1,45 @@
1
1
  """
2
2
  Date: create on 09/11/2025
3
+ Checkpoint: edit on 29/11/2025
3
4
  Author: Yang Zhou, zyaztec@gmail.com
4
- Reference: [1] Ma J, Zhao Z, Yi X, et al. Modeling task relationships in multi-task learning with multi-gate mixture-of-experts[C]//KDD. 2018: 1930-1939.
5
+ Reference:
6
+ [1] Ma J, Zhao Z, Yi X, et al. Modeling task relationships in multi-task learning with
7
+ multi-gate mixture-of-experts[C]//KDD. 2018: 1930-1939.
8
+ (https://dl.acm.org/doi/10.1145/3219819.3220007)
9
+
10
+ Multi-gate Mixture-of-Experts (MMoE) extends shared-bottom multi-task learning by
11
+ introducing multiple experts and task-specific softmax gates. Each task learns its
12
+ own routing weights over the expert pool, enabling both shared and task-specialized
13
+ representations while alleviating gradient conflicts across tasks.
14
+
15
+ In each forward pass:
16
+ (1) Shared embeddings encode all dense/sparse/sequence features
17
+ (2) Each expert processes the same input to produce candidate shared representations
18
+ (3) Every task gate outputs a simplex over experts to softly route information
19
+ (4) The task-specific weighted sum is passed into its tower and prediction head
20
+
21
+ Key Advantages:
22
+ - Soft parameter sharing reduces negative transfer between heterogeneous tasks
23
+ - Gates let tasks adaptively allocate expert capacity based on their difficulty
24
+ - Supports many tasks without duplicating full networks
25
+ - Works with mixed feature types via unified embeddings
26
+ - Simple to scale the number of experts or gates for capacity control
27
+
28
+ MMoE(Multi-gate Mixture-of-Experts)是多任务学习框架,通过多个专家网络与
29
+ 任务特定门控进行软路由,兼顾共享表示与任务特化,减轻梯度冲突问题。
30
+
31
+ 一次前向流程:
32
+ (1) 共享 embedding 统一编码稠密、稀疏与序列特征
33
+ (2) 每个专家对相同输入进行特征变换,得到候选共享表示
34
+ (3) 每个任务的门控产生对专家的概率分布,完成软选择与加权
35
+ (4) 加权结果输入到对应任务的塔网络与预测头
36
+
37
+ 主要优点:
38
+ - 软参数共享,缓解任务间负迁移
39
+ - 按任务难度自适应分配专家容量
40
+ - 便于扩展多任务,而无需复制完整网络
41
+ - 支持多种特征类型的统一建模
42
+ - 专家与门控数量可灵活调节以控制模型容量
5
43
  """
6
44
 
7
45
  import torch
@@ -75,18 +113,14 @@ class MMOE(BaseModel):
75
113
 
76
114
  if len(tower_params_list) != self.num_tasks:
77
115
  raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
78
-
79
- # All features
80
- self.all_features = dense_features + sparse_features + sequence_features
81
116
 
82
- # Embedding layer
117
+ self.all_features = dense_features + sparse_features + sequence_features
83
118
  self.embedding = EmbeddingLayer(features=self.all_features)
119
+ input_dim = self.embedding.input_dim
120
+ # emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
121
+ # dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
122
+ # input_dim = emb_dim_total + dense_input_dim
84
123
 
85
- # Calculate input dimension
86
- emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
87
- dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
88
- input_dim = emb_dim_total + dense_input_dim
89
-
90
124
  # Expert networks (shared by all tasks)
91
125
  self.experts = nn.ModuleList()
92
126
  for _ in range(num_experts):
@@ -102,10 +136,7 @@ class MMOE(BaseModel):
102
136
  # Task-specific gates
103
137
  self.gates = nn.ModuleList()
104
138
  for _ in range(self.num_tasks):
105
- gate = nn.Sequential(
106
- nn.Linear(input_dim, num_experts),
107
- nn.Softmax(dim=1)
108
- )
139
+ gate = nn.Sequential(nn.Linear(input_dim, num_experts), nn.Softmax(dim=1))
109
140
  self.gates.append(gate)
110
141
 
111
142
  # Task-specific towers
@@ -113,23 +144,10 @@ class MMOE(BaseModel):
113
144
  for tower_params in tower_params_list:
114
145
  tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
115
146
  self.towers.append(tower)
116
- self.prediction_layer = PredictionLayer(
117
- task_type=self.task_type,
118
- task_dims=[1] * self.num_tasks
119
- )
120
-
147
+ self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
121
148
  # Register regularization weights
122
- self._register_regularization_weights(
123
- embedding_attr='embedding',
124
- include_modules=['experts', 'gates', 'towers']
125
- )
126
-
127
- self.compile(
128
- optimizer=optimizer,
129
- optimizer_params=optimizer_params,
130
- loss=loss,
131
- loss_params=loss_params,
132
- )
149
+ self._register_regularization_weights(embedding_attr='embedding', include_modules=['experts', 'gates', 'towers'])
150
+ self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)
133
151
 
134
152
  def forward(self, x):
135
153
  # Get all embeddings and flatten
@@ -1,7 +1,48 @@
1
1
  """
2
2
  Date: create on 09/11/2025
3
+ Checkpoint: edit on 29/11/2025
3
4
  Author: Yang Zhou, zyaztec@gmail.com
4
- Reference: [1] Tang H, Liu J, Zhao M, et al. Progressive layered extraction (ple): A novel multi-task learning (mtl) model for personalized recommendations[C]//RecSys. 2020: 269-278.
5
+ Reference:
6
+ [1] Tang H, Liu J, Zhao M, et al. Progressive layered extraction (PLE): A novel
7
+ multi-task learning (MTL) model for personalized recommendations[C]//RecSys. 2020: 269-278.
8
+ (https://dl.acm.org/doi/10.1145/3383313.3412236)
9
+
10
+ Progressive Layered Extraction (PLE) advances multi-task learning by stacking CGC
11
+ (Customized Gate Control) blocks that mix shared and task-specific experts. Each
12
+ layer routes information via task gates and a shared gate, then feeds the outputs
13
+ forward to deeper layers, progressively disentangling shared vs. task-specific
14
+ signals and mitigating gradient interference.
15
+
16
+ Layer workflow:
17
+ (1) Shared and per-task experts transform the same inputs
18
+ (2) Task gates select among shared + task-specific experts
19
+ (3) A shared gate aggregates all experts for the shared branch
20
+ (4) Outputs become inputs to the next CGC layer (progressive refinement)
21
+ (5) Final task towers operate on the last-layer task representations
22
+
23
+ Key Advantages:
24
+ - Progressive routing reduces negative transfer across layers
25
+ - Explicit shared/specific experts improve feature disentanglement
26
+ - Flexible depth and expert counts to match task complexity
27
+ - Works with heterogeneous features via unified embeddings
28
+ - Stable training by separating gates for shared and task branches
29
+
30
+ PLE(Progressive Layered Extraction)通过堆叠 CGC 模块,联合共享与任务特定专家,
31
+ 利用任务门与共享门逐层软路由,逐步分离共享与任务差异信息,缓解多任务间的梯度冲突。
32
+
33
+ 层内流程:
34
+ (1) 共享与任务专家对同一输入做特征变换
35
+ (2) 任务门在共享+任务专家上进行软选择
36
+ (3) 共享门汇总全部专家,更新共享分支
37
+ (4) 输出作为下一层输入,完成逐层细化
38
+ (5) 最后由任务塔完成各任务预测
39
+
40
+ 主要优点:
41
+ - 逐层路由降低负迁移
42
+ - 显式区分共享/特定专家,增强特征解耦
43
+ - 专家数量与层数可按任务复杂度灵活设置
44
+ - 统一 embedding 支持多种特征类型
45
+ - 共享与任务门分离,训练更稳定
5
46
  """
6
47
 
7
48
  import torch
@@ -11,6 +52,97 @@ from nextrec.basic.model import BaseModel
11
52
  from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
12
53
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
13
54
 
55
+ class CGCLayer(nn.Module):
56
+ """
57
+ CGC (Customized Gate Control) block used by PLE.
58
+ It routes shared and task-specific experts with task gates and a shared gate.
59
+ """
60
+
61
+ def __init__(
62
+ self,
63
+ input_dim: int,
64
+ num_tasks: int,
65
+ num_shared_experts: int,
66
+ num_specific_experts: int,
67
+ shared_expert_params: dict,
68
+ specific_expert_params: dict | list[dict],
69
+ ):
70
+ super().__init__()
71
+ if num_tasks < 1:
72
+ raise ValueError("num_tasks must be >= 1")
73
+
74
+ specific_params_list = self._normalize_specific_params(specific_expert_params, num_tasks)
75
+
76
+ self.output_dim = self._get_output_dim(shared_expert_params, input_dim)
77
+ specific_dims = [self._get_output_dim(params, input_dim) for params in specific_params_list]
78
+ dims_set = set(specific_dims + [self.output_dim])
79
+ if len(dims_set) != 1:
80
+ raise ValueError(f"Shared/specific expert output dims must match, got {dims_set}")
81
+
82
+ # experts
83
+ self.shared_experts = nn.ModuleList([MLP(input_dim=input_dim, output_layer=False, **shared_expert_params,) for _ in range(num_shared_experts)])
84
+ self.specific_experts = nn.ModuleList()
85
+ for params in specific_params_list:
86
+ task_experts = nn.ModuleList([MLP(input_dim=input_dim, output_layer=False, **params,) for _ in range(num_specific_experts)])
87
+ self.specific_experts.append(task_experts)
88
+
89
+ # gates
90
+ task_gate_expert_num = num_shared_experts + num_specific_experts
91
+ self.task_gates = nn.ModuleList([nn.Sequential(nn.Linear(input_dim, task_gate_expert_num), nn.Softmax(dim=1),) for _ in range(num_tasks)])
92
+ shared_gate_expert_num = num_shared_experts + num_specific_experts * num_tasks
93
+ self.shared_gate = nn.Sequential(nn.Linear(input_dim, shared_gate_expert_num), nn.Softmax(dim=1),)
94
+
95
+ self.num_tasks = num_tasks
96
+
97
+ def forward(
98
+ self, task_inputs: list[torch.Tensor], shared_input: torch.Tensor
99
+ ) -> tuple[list[torch.Tensor], torch.Tensor]:
100
+ if len(task_inputs) != self.num_tasks:
101
+ raise ValueError(f"Expected {self.num_tasks} task inputs, got {len(task_inputs)}")
102
+
103
+ shared_outputs = [expert(shared_input) for expert in self.shared_experts]
104
+ shared_stack = torch.stack(shared_outputs, dim=0) # [num_shared, B, D]
105
+
106
+ new_task_fea: list[torch.Tensor] = []
107
+ all_specific_for_shared: list[torch.Tensor] = []
108
+
109
+ for task_idx in range(self.num_tasks):
110
+ task_input = task_inputs[task_idx]
111
+ task_specific_outputs = [expert(task_input) for expert in self.specific_experts[task_idx]] # type: ignore
112
+ all_specific_for_shared.extend(task_specific_outputs)
113
+ specific_stack = torch.stack(task_specific_outputs, dim=0)
114
+
115
+ all_experts = torch.cat([shared_stack, specific_stack], dim=0)
116
+ all_experts_t = all_experts.permute(1, 0, 2) # [B, num_expert, D]
117
+
118
+ gate_weights = self.task_gates[task_idx](task_input).unsqueeze(2)
119
+ gated_output = torch.sum(gate_weights * all_experts_t, dim=1)
120
+ new_task_fea.append(gated_output)
121
+
122
+ all_for_shared = all_specific_for_shared + shared_outputs
123
+ all_for_shared_tensor = torch.stack(all_for_shared, dim=1) # [B, num_all, D]
124
+ shared_gate_weights = self.shared_gate(shared_input).unsqueeze(1)
125
+ new_shared = torch.bmm(shared_gate_weights, all_for_shared_tensor).squeeze(1)
126
+
127
+ return new_task_fea, new_shared
128
+
129
+ @staticmethod
130
+ def _get_output_dim(params: dict, fallback: int) -> int:
131
+ dims = params.get("dims")
132
+ if dims:
133
+ return dims[-1]
134
+ return fallback
135
+
136
+ @staticmethod
137
+ def _normalize_specific_params(
138
+ params: dict | list[dict], num_tasks: int
139
+ ) -> list[dict]:
140
+ if isinstance(params, list):
141
+ if len(params) != num_tasks:
142
+ raise ValueError(f"Length of specific_expert_params ({len(params)}) must match num_tasks ({num_tasks}).")
143
+ return [p.copy() for p in params]
144
+ return [params.copy() for _ in range(num_tasks)]
145
+
14
146
 
15
147
  class PLE(BaseModel):
16
148
  """
@@ -35,7 +167,7 @@ class PLE(BaseModel):
35
167
  sparse_features: list[SparseFeature],
36
168
  sequence_features: list[SequenceFeature],
37
169
  shared_expert_params: dict,
38
- specific_expert_params: dict,
170
+ specific_expert_params: dict | list[dict],
39
171
  num_shared_experts: int,
40
172
  num_specific_experts: int,
41
173
  num_levels: int,
@@ -43,7 +175,7 @@ class PLE(BaseModel):
43
175
  target: list[str],
44
176
  task: str | list[str] = 'binary',
45
177
  optimizer: str = "adam",
46
- optimizer_params: dict = {},
178
+ optimizer_params: dict | None = None,
47
179
  loss: str | nn.Module | list[str | nn.Module] | None = "bce",
48
180
  loss_params: dict | list[dict] | None = None,
49
181
  device: str = 'cpu',
@@ -71,7 +203,6 @@ class PLE(BaseModel):
71
203
  self.loss = loss
72
204
  if self.loss is None:
73
205
  self.loss = "bce"
74
-
75
206
  # Number of tasks, experts, and levels
76
207
  self.num_tasks = len(target)
77
208
  self.num_shared_experts = num_shared_experts
@@ -79,20 +210,16 @@ class PLE(BaseModel):
79
210
  self.num_levels = num_levels
80
211
  if optimizer_params is None:
81
212
  optimizer_params = {}
82
-
83
213
  if len(tower_params_list) != self.num_tasks:
84
214
  raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
85
-
86
- # All features
87
- self.all_features = dense_features + sparse_features + sequence_features
88
-
89
215
  # Embedding layer
90
216
  self.embedding = EmbeddingLayer(features=self.all_features)
91
217
 
92
218
  # Calculate input dimension
93
- emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
94
- dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
95
- input_dim = emb_dim_total + dense_input_dim
219
+ input_dim = self.embedding.input_dim
220
+ # emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
221
+ # dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
222
+ # input_dim = emb_dim_total + dense_input_dim
96
223
 
97
224
  # Get expert output dimension
98
225
  if 'dims' in shared_expert_params and len(shared_expert_params['dims']) > 0:
@@ -100,74 +227,30 @@ class PLE(BaseModel):
100
227
  else:
101
228
  expert_output_dim = input_dim
102
229
 
103
- # Build extraction layers (CGC layers)
104
- self.shared_experts_layers = nn.ModuleList() # [num_levels]
105
- self.specific_experts_layers = nn.ModuleList() # [num_levels, num_tasks]
106
- self.gates_layers = nn.ModuleList() # [num_levels, num_tasks + 1] (+1 for shared gate)
107
-
230
+ # Build CGC layers
231
+ self.cgc_layers = nn.ModuleList()
108
232
  for level in range(num_levels):
109
- # Input dimension for this level
110
233
  level_input_dim = input_dim if level == 0 else expert_output_dim
111
-
112
- # Shared experts for this level
113
- shared_experts = nn.ModuleList()
114
- for _ in range(num_shared_experts):
115
- expert = MLP(input_dim=level_input_dim, output_layer=False, **shared_expert_params)
116
- shared_experts.append(expert)
117
- self.shared_experts_layers.append(shared_experts)
118
-
119
- # Task-specific experts for this level
120
- specific_experts_for_tasks = nn.ModuleList()
121
- for _ in range(self.num_tasks):
122
- task_experts = nn.ModuleList()
123
- for _ in range(num_specific_experts):
124
- expert = MLP(input_dim=level_input_dim, output_layer=False, **specific_expert_params)
125
- task_experts.append(expert)
126
- specific_experts_for_tasks.append(task_experts)
127
- self.specific_experts_layers.append(specific_experts_for_tasks)
128
-
129
- # Gates for this level (num_tasks task gates + 1 shared gate)
130
- gates = nn.ModuleList()
131
- # Task-specific gates
132
- num_experts_for_task_gate = num_shared_experts + num_specific_experts
133
- for _ in range(self.num_tasks):
134
- gate = nn.Sequential(
135
- nn.Linear(level_input_dim, num_experts_for_task_gate),
136
- nn.Softmax(dim=1)
137
- )
138
- gates.append(gate)
139
- # Shared gate: contains all tasks' specific experts + shared experts
140
- # expert counts = num_shared_experts + num_specific_experts * num_tasks
141
- num_experts_for_shared_gate = num_shared_experts + num_specific_experts * self.num_tasks
142
- shared_gate = nn.Sequential(
143
- nn.Linear(level_input_dim, num_experts_for_shared_gate),
144
- nn.Softmax(dim=1)
234
+ cgc_layer = CGCLayer(
235
+ input_dim=level_input_dim,
236
+ num_tasks=self.num_tasks,
237
+ num_shared_experts=num_shared_experts,
238
+ num_specific_experts=num_specific_experts,
239
+ shared_expert_params=shared_expert_params,
240
+ specific_expert_params=specific_expert_params,
145
241
  )
146
- gates.append(shared_gate)
147
- self.gates_layers.append(gates)
242
+ self.cgc_layers.append(cgc_layer)
243
+ expert_output_dim = cgc_layer.output_dim
148
244
 
149
245
  # Task-specific towers
150
246
  self.towers = nn.ModuleList()
151
247
  for tower_params in tower_params_list:
152
248
  tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
153
249
  self.towers.append(tower)
154
- self.prediction_layer = PredictionLayer(
155
- task_type=self.task_type,
156
- task_dims=[1] * self.num_tasks
157
- )
158
-
250
+ self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
159
251
  # Register regularization weights
160
- self._register_regularization_weights(
161
- embedding_attr='embedding',
162
- include_modules=['shared_experts_layers', 'specific_experts_layers', 'gates_layers', 'towers']
163
- )
164
-
165
- self.compile(
166
- optimizer=optimizer,
167
- optimizer_params=optimizer_params,
168
- loss=loss,
169
- loss_params=loss_params,
170
- )
252
+ self._register_regularization_weights(embedding_attr='embedding', include_modules=['cgc_layers', 'towers'])
253
+ self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=self.loss, loss_params=loss_params)
171
254
 
172
255
  def forward(self, x):
173
256
  # Get all embeddings and flatten
@@ -178,76 +261,8 @@ class PLE(BaseModel):
178
261
  shared_fea = input_flat
179
262
 
180
263
  # Progressive Layered Extraction: CGC
181
- for level in range(self.num_levels):
182
- shared_experts = self.shared_experts_layers[level] # ModuleList[num_shared_experts]
183
- specific_experts = self.specific_experts_layers[level] # ModuleList[num_tasks][num_specific_experts]
184
- gates = self.gates_layers[level] # ModuleList[num_tasks + 1]
185
-
186
- # Compute shared experts output for this level
187
- # shared_expert_list: List[Tensor[B, expert_dim]]
188
- shared_expert_list = [expert(shared_fea) for expert in shared_experts] # type: ignore[list-item]
189
- # [num_shared_experts, B, expert_dim]
190
- shared_expert_outputs = torch.stack(shared_expert_list, dim=0)
191
-
192
- all_specific_outputs_for_shared = []
193
-
194
- # Compute task's gated output and specific outputs
195
- new_task_fea = []
196
- for task_idx in range(self.num_tasks):
197
- # Current input for this task at this level
198
- current_task_in = task_fea[task_idx]
199
-
200
- # Specific task experts for this task
201
- task_expert_modules = specific_experts[task_idx] # type: ignore
202
-
203
- # Specific task expert output list List[Tensor[B, expert_dim]]
204
- task_specific_list = []
205
- for expert in task_expert_modules:
206
- out = expert(current_task_in)
207
- task_specific_list.append(out)
208
- # All specific task experts are candidates for the shared gate
209
- all_specific_outputs_for_shared.append(out)
210
-
211
- # [num_specific_taskexperts, B, expert_dim]
212
- task_specific_outputs = torch.stack(task_specific_list, dim=0)
213
-
214
- # Input for gate: shared_experts + own specific task experts
215
- # [num_shared + num_specific, B, expert_dim]
216
- all_expert_outputs = torch.cat(
217
- [shared_expert_outputs, task_specific_outputs],
218
- dim=0
219
- )
220
- # [B, num_experts, expert_dim]
221
- all_expert_outputs_t = all_expert_outputs.permute(1, 0, 2)
222
-
223
- # Gate for task (gates[task_idx])
224
- # Output shape: [B, num_shared + num_specific]
225
- gate_weights = gates[task_idx](current_task_in)
226
- # [B, num_experts, 1]
227
- gate_weights = gate_weights.unsqueeze(2)
228
-
229
- # Weighted sum to get this task's features at this level: [B, expert_dim]
230
- gated_output = torch.sum(gate_weights * all_expert_outputs_t, dim=1)
231
- new_task_fea.append(gated_output)
232
-
233
- # compute shared gate output
234
- # Input for shared gate: specific task experts + shared experts
235
- # all_specific_outputs_for_shared: List[Tensor[B, expert_dim]]
236
- # shared_expert_list: List[Tensor[B, expert_dim]]
237
- all_for_shared_list = all_specific_outputs_for_shared + shared_expert_list
238
- # [B, num_all_experts, expert_dim]
239
- all_for_shared = torch.stack(all_for_shared_list, dim=1)
240
-
241
- # [B, num_all_experts]
242
- shared_gate_weights = gates[self.num_tasks](shared_fea) # type: ignore
243
- # [B, 1, num_all_experts]
244
- shared_gate_weights = shared_gate_weights.unsqueeze(1)
245
-
246
- # weighted sum: [B, 1, expert_dim] → [B, expert_dim]
247
- new_shared_fea = torch.bmm(shared_gate_weights, all_for_shared).squeeze(1)
248
-
249
- task_fea = new_task_fea
250
- shared_fea = new_shared_fea
264
+ for layer in self.cgc_layers:
265
+ task_fea, shared_fea = layer(task_fea, shared_fea)
251
266
 
252
267
  # task tower
253
268
  task_outputs = []
@@ -257,4 +272,4 @@ class PLE(BaseModel):
257
272
 
258
273
  # [B, num_tasks]
259
274
  y = torch.cat(task_outputs, dim=1)
260
- return self.prediction_layer(y)
275
+ return self.prediction_layer(y)