nextrec 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +41 -0
- nextrec/__version__.py +1 -0
- nextrec/basic/__init__.py +0 -0
- nextrec/basic/activation.py +92 -0
- nextrec/basic/callback.py +35 -0
- nextrec/basic/dataloader.py +447 -0
- nextrec/basic/features.py +87 -0
- nextrec/basic/layers.py +985 -0
- nextrec/basic/loggers.py +124 -0
- nextrec/basic/metrics.py +557 -0
- nextrec/basic/model.py +1438 -0
- nextrec/data/__init__.py +27 -0
- nextrec/data/data_utils.py +132 -0
- nextrec/data/preprocessor.py +662 -0
- nextrec/loss/__init__.py +35 -0
- nextrec/loss/loss_utils.py +136 -0
- nextrec/loss/match_losses.py +294 -0
- nextrec/models/generative/hstu.py +0 -0
- nextrec/models/generative/tiger.py +0 -0
- nextrec/models/match/__init__.py +13 -0
- nextrec/models/match/dssm.py +200 -0
- nextrec/models/match/dssm_v2.py +162 -0
- nextrec/models/match/mind.py +210 -0
- nextrec/models/match/sdm.py +253 -0
- nextrec/models/match/youtube_dnn.py +172 -0
- nextrec/models/multi_task/esmm.py +129 -0
- nextrec/models/multi_task/mmoe.py +161 -0
- nextrec/models/multi_task/ple.py +260 -0
- nextrec/models/multi_task/share_bottom.py +126 -0
- nextrec/models/ranking/__init__.py +17 -0
- nextrec/models/ranking/afm.py +118 -0
- nextrec/models/ranking/autoint.py +140 -0
- nextrec/models/ranking/dcn.py +120 -0
- nextrec/models/ranking/deepfm.py +95 -0
- nextrec/models/ranking/dien.py +214 -0
- nextrec/models/ranking/din.py +181 -0
- nextrec/models/ranking/fibinet.py +130 -0
- nextrec/models/ranking/fm.py +87 -0
- nextrec/models/ranking/masknet.py +125 -0
- nextrec/models/ranking/pnn.py +128 -0
- nextrec/models/ranking/widedeep.py +105 -0
- nextrec/models/ranking/xdeepfm.py +117 -0
- nextrec/utils/__init__.py +18 -0
- nextrec/utils/common.py +14 -0
- nextrec/utils/embedding.py +19 -0
- nextrec/utils/initializer.py +47 -0
- nextrec/utils/optimizer.py +75 -0
- nextrec-0.1.1.dist-info/METADATA +302 -0
- nextrec-0.1.1.dist-info/RECORD +51 -0
- nextrec-0.1.1.dist-info/WHEEL +4 -0
- nextrec-0.1.1.dist-info/licenses/LICENSE +21 -0

nextrec/models/multi_task/ple.py
@@ -0,0 +1,260 @@
+"""
+Date: create on 09/11/2025
+Author:
+    Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] Tang H, Liu J, Zhao M, et al. Progressive layered extraction (ple): A novel multi-task learning (mtl) model for personalized recommendations[C]//RecSys. 2020: 269-278.
+"""
+
+import torch
+import torch.nn as nn
+
+from nextrec.basic.model import BaseModel
+from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+class PLE(BaseModel):
+    """
+    Progressive Layered Extraction
+
+    PLE is an advanced multi-task learning model that extends MMOE by introducing
+    both task-specific experts and shared experts at each level. It uses a progressive
+    routing mechanism where experts from level k feed into gates at level k+1.
+    This design better captures task-specific and shared information progressively.
+    """
+
+    @property
+    def model_name(self):
+        return "PLE"
+
+    @property
+    def task_type(self):
+        return self.task if isinstance(self.task, list) else [self.task]
+
+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 shared_expert_params: dict,
+                 specific_expert_params: dict,
+                 num_shared_experts: int,
+                 num_specific_experts: int,
+                 num_levels: int,
+                 tower_params_list: list[dict],
+                 target: list[str],
+                 task: str | list[str] = 'binary',
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
+        super(PLE, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=task,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            early_stop_patience=20,
+            model_id=model_id
+        )
+
+        self.loss = loss
+        if self.loss is None:
+            self.loss = "bce"
+
+        # Number of tasks, experts, and levels
+        self.num_tasks = len(target)
+        self.num_shared_experts = num_shared_experts
+        self.num_specific_experts = num_specific_experts
+        self.num_levels = num_levels
+        if optimizer_params is None:
+            optimizer_params = {}
+
+        if len(tower_params_list) != self.num_tasks:
+            raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
+        # All features
+        self.all_features = dense_features + sparse_features + sequence_features
+
+        # Embedding layer
+        self.embedding = EmbeddingLayer(features=self.all_features)
+
+        # Calculate input dimension
+        emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+        dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
+        input_dim = emb_dim_total + dense_input_dim
+
+        # Get expert output dimension
+        if 'dims' in shared_expert_params and len(shared_expert_params['dims']) > 0:
+            expert_output_dim = shared_expert_params['dims'][-1]
+        else:
+            expert_output_dim = input_dim
+
+        # Build extraction layers (CGC layers)
+        self.shared_experts_layers = nn.ModuleList()    # [num_levels]
+        self.specific_experts_layers = nn.ModuleList()  # [num_levels, num_tasks]
+        self.gates_layers = nn.ModuleList()             # [num_levels, num_tasks + 1] (+1 for shared gate)
+
+        for level in range(num_levels):
+            # Input dimension for this level
+            level_input_dim = input_dim if level == 0 else expert_output_dim
+
+            # Shared experts for this level
+            shared_experts = nn.ModuleList()
+            for _ in range(num_shared_experts):
+                expert = MLP(input_dim=level_input_dim, output_layer=False, **shared_expert_params)
+                shared_experts.append(expert)
+            self.shared_experts_layers.append(shared_experts)
+
+            # Task-specific experts for this level
+            specific_experts_for_tasks = nn.ModuleList()
+            for _ in range(self.num_tasks):
+                task_experts = nn.ModuleList()
+                for _ in range(num_specific_experts):
+                    expert = MLP(input_dim=level_input_dim, output_layer=False, **specific_expert_params)
+                    task_experts.append(expert)
+                specific_experts_for_tasks.append(task_experts)
+            self.specific_experts_layers.append(specific_experts_for_tasks)
+
+            # Gates for this level (num_tasks task gates + 1 shared gate)
+            gates = nn.ModuleList()
+            # Task-specific gates
+            num_experts_for_task_gate = num_shared_experts + num_specific_experts
+            for _ in range(self.num_tasks):
+                gate = nn.Sequential(
+                    nn.Linear(level_input_dim, num_experts_for_task_gate),
+                    nn.Softmax(dim=1)
+                )
+                gates.append(gate)
+            # Shared gate: sees all tasks' specific experts plus the shared experts,
+            # so its expert count = num_shared_experts + num_specific_experts * num_tasks
+            num_experts_for_shared_gate = num_shared_experts + num_specific_experts * self.num_tasks
+            shared_gate = nn.Sequential(
+                nn.Linear(level_input_dim, num_experts_for_shared_gate),
+                nn.Softmax(dim=1)
+            )
+            gates.append(shared_gate)
+            self.gates_layers.append(gates)
+
+        # Task-specific towers
+        self.towers = nn.ModuleList()
+        for tower_params in tower_params_list:
+            tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
+            self.towers.append(tower)
+        self.prediction_layer = PredictionLayer(
+            task_type=self.task_type,
+            task_dims=[1] * self.num_tasks
+        )
+
+        # Register regularization weights
+        self._register_regularization_weights(
+            embedding_attr='embedding',
+            include_modules=['shared_experts_layers', 'specific_experts_layers', 'gates_layers', 'towers']
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
+
+    def forward(self, x):
+        # Get all embeddings and flatten
+        input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
+
+        # Initial features for each task and for the shared branch
+        task_fea = [input_flat for _ in range(self.num_tasks)]
+        shared_fea = input_flat
+
+        # Progressive Layered Extraction: CGC levels
+        for level in range(self.num_levels):
+            shared_experts = self.shared_experts_layers[level]      # ModuleList[num_shared_experts]
+            specific_experts = self.specific_experts_layers[level]  # ModuleList[num_tasks][num_specific_experts]
+            gates = self.gates_layers[level]                        # ModuleList[num_tasks + 1]
+
+            # Compute the shared experts' outputs for this level
+            # shared_expert_list: List[Tensor[B, expert_dim]]
+            shared_expert_list = [expert(shared_fea) for expert in shared_experts]  # type: ignore[list-item]
+            # [num_shared_experts, B, expert_dim]
+            shared_expert_outputs = torch.stack(shared_expert_list, dim=0)
+
+            all_specific_outputs_for_shared = []
+
+            # Compute each task's gated output and specific expert outputs
+            new_task_fea = []
+            for task_idx in range(self.num_tasks):
+                # Current input for this task at this level
+                current_task_in = task_fea[task_idx]
+
+                # Specific experts for this task
+                task_expert_modules = specific_experts[task_idx]  # type: ignore
+
+                # Specific expert output list: List[Tensor[B, expert_dim]]
+                task_specific_list = []
+                for expert in task_expert_modules:
+                    out = expert(current_task_in)
+                    task_specific_list.append(out)
+                    # All specific task experts are candidates for the shared gate
+                    all_specific_outputs_for_shared.append(out)
+
+                # [num_specific_experts, B, expert_dim]
+                task_specific_outputs = torch.stack(task_specific_list, dim=0)
+
+                # Candidates for this task's gate: shared experts + its own specific experts
+                # [num_shared + num_specific, B, expert_dim]
+                all_expert_outputs = torch.cat(
+                    [shared_expert_outputs, task_specific_outputs],
+                    dim=0
+                )
+                # [B, num_experts, expert_dim]
+                all_expert_outputs_t = all_expert_outputs.permute(1, 0, 2)
+
+                # Gate for this task (gates[task_idx])
+                # Output shape: [B, num_shared + num_specific]
+                gate_weights = gates[task_idx](current_task_in)
+                # [B, num_experts, 1]
+                gate_weights = gate_weights.unsqueeze(2)
+
+                # Weighted sum gives this task's features at this level: [B, expert_dim]
+                gated_output = torch.sum(gate_weights * all_expert_outputs_t, dim=1)
+                new_task_fea.append(gated_output)
+
+            # Compute the shared gate output
+            # Candidates for the shared gate: all specific task experts + shared experts
+            # all_specific_outputs_for_shared: List[Tensor[B, expert_dim]]
+            # shared_expert_list: List[Tensor[B, expert_dim]]
+            all_for_shared_list = all_specific_outputs_for_shared + shared_expert_list
+            # [B, num_all_experts, expert_dim]
+            all_for_shared = torch.stack(all_for_shared_list, dim=1)
+
+            # [B, num_all_experts]
+            shared_gate_weights = gates[self.num_tasks](shared_fea)  # type: ignore
+            # [B, 1, num_all_experts]
+            shared_gate_weights = shared_gate_weights.unsqueeze(1)
+
+            # Weighted sum: [B, 1, expert_dim] → [B, expert_dim]
+            new_shared_fea = torch.bmm(shared_gate_weights, all_for_shared).squeeze(1)
+
+            task_fea = new_task_fea
+            shared_fea = new_shared_fea
+
+        # Task towers
+        task_outputs = []
+        for task_idx in range(self.num_tasks):
+            tower_output = self.towers[task_idx](task_fea[task_idx])  # [B, 1]
+            task_outputs.append(tower_output)
+
+        # [B, num_tasks]
+        y = torch.cat(task_outputs, dim=1)
+        return self.prediction_layer(y)
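
For one task at one CGC level, the gating in the forward pass above reduces to a softmax-weighted sum over that task's candidate experts. A minimal standalone sketch in plain torch; the sizes and random tensors below are illustrative assumptions standing in for the MLP experts and the nn.Linear gate:

import torch

B, D = 4, 16                 # batch size, expert output dim (illustrative)
num_shared, num_specific = 2, 3

shared_outputs = torch.randn(num_shared, B, D)     # stacked shared expert outputs
specific_outputs = torch.randn(num_specific, B, D) # one task's specific expert outputs

# A task gate sees the shared experts plus its own specific experts.
all_outputs = torch.cat([shared_outputs, specific_outputs], dim=0)  # [E, B, D]
all_outputs = all_outputs.permute(1, 0, 2)                          # [B, E, D]

gate_logits = torch.randn(B, num_shared + num_specific)  # stands in for the nn.Linear gate
gate_weights = torch.softmax(gate_logits, dim=1).unsqueeze(2)       # [B, E, 1]

task_feature = torch.sum(gate_weights * all_outputs, dim=1)         # [B, D]
print(task_feature.shape)  # torch.Size([4, 16])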

nextrec/models/multi_task/share_bottom.py
@@ -0,0 +1,126 @@
+"""
+Date: create on 09/11/2025
+Author:
+    Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] Caruana R. Multitask learning[J]. Machine learning, 1997, 28: 41-75.
+"""
+
+import torch
+import torch.nn as nn
+
+from nextrec.basic.model import BaseModel
+from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+class ShareBottom(BaseModel):
+    @property
+    def model_name(self):
+        return "ShareBottom"
+
+    @property
+    def task_type(self):
+        # Multi-task model: return the list of task types
+        return self.task if isinstance(self.task, list) else [self.task]
+
+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 bottom_params: dict,
+                 tower_params_list: list[dict],
+                 target: list[str],
+                 task: str | list[str] = 'binary',
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
+        super(ShareBottom, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=task,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            early_stop_patience=20,
+            model_id=model_id
+        )
+
+        self.loss = loss
+        if self.loss is None:
+            self.loss = "bce"
+
+        # Number of tasks
+        self.num_tasks = len(target)
+        if len(tower_params_list) != self.num_tasks:
+            raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
+        # All features
+        self.all_features = dense_features + sparse_features + sequence_features
+
+        # Embedding layer
+        self.embedding = EmbeddingLayer(features=self.all_features)
+
+        # Calculate input dimension
+        emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+        dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
+        input_dim = emb_dim_total + dense_input_dim
+
+        # Shared bottom network
+        self.bottom = MLP(input_dim=input_dim, output_layer=False, **bottom_params)
+
+        # Get bottom output dimension
+        if 'dims' in bottom_params and len(bottom_params['dims']) > 0:
+            bottom_output_dim = bottom_params['dims'][-1]
+        else:
+            bottom_output_dim = input_dim
+
+        # Task-specific towers
+        self.towers = nn.ModuleList()
+        for tower_params in tower_params_list:
+            tower = MLP(input_dim=bottom_output_dim, output_layer=True, **tower_params)
+            self.towers.append(tower)
+        self.prediction_layer = PredictionLayer(
+            task_type=self.task_type,
+            task_dims=[1] * self.num_tasks
+        )
+
+        # Register regularization weights
+        self._register_regularization_weights(
+            embedding_attr='embedding',
+            include_modules=['bottom', 'towers']
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
+
+    def forward(self, x):
+        # Get all embeddings and flatten
+        input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
+
+        # Shared bottom
+        bottom_output = self.bottom(input_flat)  # [B, bottom_dim]
+
+        # Task-specific towers
+        task_outputs = []
+        for tower in self.towers:
+            tower_output = tower(bottom_output)  # [B, 1]
+            task_outputs.append(tower_output)
+
+        # Concatenate outputs: [B, num_tasks]
+        y = torch.cat(task_outputs, dim=1)
+        return self.prediction_layer(y)
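
For contrast with PLE, the shared-bottom pattern above boils down to a single trunk feeding every task tower. A minimal sketch with plain torch modules in place of nextrec's EmbeddingLayer/MLP; all sizes are illustrative assumptions:

import torch
import torch.nn as nn

B, input_dim, bottom_dim, num_tasks = 4, 32, 16, 2  # illustrative sizes

bottom = nn.Sequential(nn.Linear(input_dim, bottom_dim), nn.ReLU())
towers = nn.ModuleList(nn.Linear(bottom_dim, 1) for _ in range(num_tasks))

x = torch.randn(B, input_dim)   # stands in for the flattened embeddings
h = bottom(x)                   # one shared representation...
logits = torch.cat([tower(h) for tower in towers], dim=1)  # ...consumed by every tower
print(logits.shape)  # torch.Size([4, 2]) = [B, num_tasks]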

nextrec/models/ranking/__init__.py
@@ -0,0 +1,17 @@
+from .deepfm import DeepFM
+from .autoint import AutoInt
+from .widedeep import WideDeep
+from .xdeepfm import xDeepFM
+from .dcn import DCN
+from .din import DIN
+from .dien import DIEN
+
+__all__ = [
+    'DeepFM',
+    'AutoInt',
+    'WideDeep',
+    'xDeepFM',
+    'DCN',
+    'DIN',
+    'DIEN',
+]

nextrec/models/ranking/afm.py
@@ -0,0 +1,118 @@
+"""
+Date: create on 09/11/2025
+Author:
+    Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
+        feature interactions via attention networks[C]//IJCAI. 2017: 3119-3125.
+"""
+
+import torch
+import torch.nn as nn
+
+from nextrec.basic.model import BaseModel
+from nextrec.basic.layers import EmbeddingLayer, LR, PredictionLayer
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+class AFM(BaseModel):
+    @property
+    def model_name(self):
+        return "AFM"
+
+    @property
+    def task_type(self):
+        return "binary"
+
+    def __init__(self,
+                 dense_features: list[DenseFeature] | list = [],
+                 sparse_features: list[SparseFeature] | list = [],
+                 sequence_features: list[SequenceFeature] | list = [],
+                 attention_dim: int = 32,
+                 attention_dropout: float = 0.0,
+                 target: list[str] | list = [],
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
+        super(AFM, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=self.task_type,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            early_stop_patience=20,
+            model_id=model_id
+        )
+
+        self.loss = loss
+        if self.loss is None:
+            self.loss = "bce"
+
+        self.fm_features = sparse_features + sequence_features
+        if len(self.fm_features) < 2:
+            raise ValueError("AFM requires at least two sparse/sequence features to build pairwise interactions.")
+
+        # Assume a uniform embedding dimension across FM fields
+        self.embedding_dim = self.fm_features[0].embedding_dim
+        if any(f.embedding_dim != self.embedding_dim for f in self.fm_features):
+            raise ValueError("All FM features must share the same embedding_dim for AFM.")
+
+        self.embedding = EmbeddingLayer(features=self.fm_features)
+
+        fm_input_dim = sum([f.embedding_dim for f in self.fm_features])
+        self.linear = LR(fm_input_dim)
+
+        self.attention_linear = nn.Linear(self.embedding_dim, attention_dim)
+        self.attention_p = nn.Linear(attention_dim, 1, bias=False)
+        self.attention_dropout = nn.Dropout(attention_dropout)
+        self.output_projection = nn.Linear(self.embedding_dim, 1, bias=False)
+        self.prediction_layer = PredictionLayer(task_type=self.task_type)
+
+        # Register regularization weights
+        self._register_regularization_weights(
+            embedding_attr='embedding',
+            include_modules=['linear', 'attention_linear', 'attention_p', 'output_projection']
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
+
+    def forward(self, x):
+        field_emb = self.embedding(x=x, features=self.fm_features, squeeze_dim=False)  # [B, F, D]
+        input_linear = field_emb.flatten(start_dim=1)
+        y_linear = self.linear(input_linear)
+
+        # Element-wise products of all field pairs
+        interactions = []
+        num_fields = field_emb.shape[1]
+        for i in range(num_fields - 1):
+            vi = field_emb[:, i, :]
+            for j in range(i + 1, num_fields):
+                vj = field_emb[:, j, :]
+                interactions.append(vi * vj)
+
+        pair_tensor = torch.stack(interactions, dim=1)  # [B, num_pairs, D]
+        attention_scores = torch.tanh(self.attention_linear(pair_tensor))
+        attention_scores = self.attention_p(attention_scores)  # [B, num_pairs, 1]
+        attention_weights = torch.softmax(attention_scores, dim=1)
+
+        weighted_sum = torch.sum(attention_weights * pair_tensor, dim=1)
+        weighted_sum = self.attention_dropout(weighted_sum)
+        y_afm = self.output_projection(weighted_sum)
+
+        y = y_linear + y_afm
+        return self.prediction_layer(y)
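
The double loop in forward builds the F*(F-1)/2 element-wise pair products that the attention net then scores. A vectorized sketch of the same computation using torch.triu_indices; the sizes and freshly initialized layers are illustrative assumptions, not the package's trained weights:

import torch
import torch.nn as nn

B, F, D, A = 4, 5, 8, 32  # batch, fields, embedding dim, attention dim (illustrative)

field_emb = torch.randn(B, F, D)
attention_linear = nn.Linear(D, A)
attention_p = nn.Linear(A, 1, bias=False)
output_projection = nn.Linear(D, 1, bias=False)

# Element-wise products of all F*(F-1)/2 field pairs in one shot.
i, j = torch.triu_indices(F, F, offset=1)
pairs = field_emb[:, i, :] * field_emb[:, j, :]             # [B, num_pairs, D]

scores = attention_p(torch.tanh(attention_linear(pairs)))   # [B, num_pairs, 1]
weights = torch.softmax(scores, dim=1)
y_afm = output_projection((weights * pairs).sum(dim=1))     # [B, 1]
print(y_afm.shape)  # torch.Size([4, 1])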

nextrec/models/ranking/autoint.py
@@ -0,0 +1,140 @@
+"""
+Date: create on 09/11/2025
+Author:
+    Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
+        self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
+        on information and knowledge management. 2019: 1161-1170.
+        (https://arxiv.org/abs/1810.11921)
+"""
+
+import torch
+import torch.nn as nn
+
+from nextrec.basic.model import BaseModel
+from nextrec.basic.layers import EmbeddingLayer, MultiHeadSelfAttention, PredictionLayer
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+class AutoInt(BaseModel):
+    @property
+    def model_name(self):
+        return "AutoInt"
+
+    @property
+    def task_type(self):
+        return "binary"
+
+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 att_layer_num: int = 3,
+                 att_embedding_dim: int = 8,
+                 att_head_num: int = 2,
+                 att_dropout: float = 0.0,
+                 att_use_residual: bool = True,
+                 target: list[str] = [],
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
+        super(AutoInt, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=self.task_type,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            early_stop_patience=20,
+            model_id=model_id
+        )
+
+        self.loss = loss
+        if self.loss is None:
+            self.loss = "bce"
+
+        self.att_layer_num = att_layer_num
+        self.att_embedding_dim = att_embedding_dim
+
+        # Use sparse and sequence features for interaction
+        self.interaction_features = sparse_features + sequence_features
+
+        # All features for embedding
+        self.all_features = dense_features + sparse_features + sequence_features
+
+        # Embedding layer
+        self.embedding = EmbeddingLayer(features=self.all_features)
+
+        # Project embeddings to the attention embedding dimension
+        num_fields = len(self.interaction_features)
+        total_embedding_dim = sum([f.embedding_dim for f in self.interaction_features])
+
+        # If embeddings have different dimensions, project them to att_embedding_dim
+        self.need_projection = not all(f.embedding_dim == att_embedding_dim for f in self.interaction_features)
+        self.projection_layers = None
+        if self.need_projection:
+            self.projection_layers = nn.ModuleList([
+                nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
+                for f in self.interaction_features
+            ])
+
+        # Multi-head self-attention layers
+        self.attention_layers = nn.ModuleList([
+            MultiHeadSelfAttention(
+                embedding_dim=att_embedding_dim,
+                num_heads=att_head_num,
+                dropout=att_dropout,
+                use_residual=att_use_residual
+            ) for _ in range(att_layer_num)
+        ])
+
+        # Final prediction layer
+        self.fc = nn.Linear(num_fields * att_embedding_dim, 1)
+        self.prediction_layer = PredictionLayer(task_type=self.task_type)
+
+        # Register regularization weights
+        self._register_regularization_weights(
+            embedding_attr='embedding',
+            include_modules=['projection_layers', 'attention_layers', 'fc']
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
+
+    def forward(self, x):
+        # Get embeddings field by field so mixed dimensions can be projected safely
+        field_embeddings = []
+        if len(self.interaction_features) == 0:
+            raise ValueError("AutoInt requires at least one sparse or sequence feature for interactions.")
+        for idx, feature in enumerate(self.interaction_features):
+            feature_emb = self.embedding(x=x, features=[feature], squeeze_dim=False)
+            feature_emb = feature_emb.squeeze(1)  # [B, embedding_dim]
+            if self.need_projection and self.projection_layers is not None:
+                feature_emb = self.projection_layers[idx](feature_emb)
+            field_embeddings.append(feature_emb.unsqueeze(1))  # [B, 1, att_embedding_dim or original dim]
+        embeddings = torch.cat(field_embeddings, dim=1)
+
+        # Apply multi-head self-attention layers
+        attention_output = embeddings
+        for att_layer in self.attention_layers:
+            attention_output = att_layer(attention_output)  # [B, num_fields, att_embedding_dim]
+
+        # Flatten and predict
+        attention_output_flat = attention_output.flatten(start_dim=1)  # [B, num_fields * att_embedding_dim]
+        y = self.fc(attention_output_flat)  # [B, 1]
+        return self.prediction_layer(y)
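
nextrec's MultiHeadSelfAttention internals are not shown in this diff. As a rough stand-in, the interaction stack can be sketched with torch's built-in nn.MultiheadAttention plus an explicit residual connection (an assumption based on the att_use_residual flag); all sizes are illustrative:

import torch
import torch.nn as nn

B, num_fields, att_dim, heads, layers = 4, 6, 8, 2, 3  # illustrative sizes

attn_layers = nn.ModuleList(
    nn.MultiheadAttention(att_dim, heads, batch_first=True) for _ in range(layers)
)
fc = nn.Linear(num_fields * att_dim, 1)

x = torch.randn(B, num_fields, att_dim)  # stands in for the projected field embeddings
for attn in attn_layers:
    out, _ = attn(x, x, x)  # self-attention over the field axis
    x = x + out             # residual, per the att_use_residual assumption
y = fc(x.flatten(start_dim=1))  # [B, 1]
print(y.shape)  # torch.Size([4, 1])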