nextrec-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. nextrec/__init__.py +41 -0
  2. nextrec/__version__.py +1 -0
  3. nextrec/basic/__init__.py +0 -0
  4. nextrec/basic/activation.py +92 -0
  5. nextrec/basic/callback.py +35 -0
  6. nextrec/basic/dataloader.py +447 -0
  7. nextrec/basic/features.py +87 -0
  8. nextrec/basic/layers.py +985 -0
  9. nextrec/basic/loggers.py +124 -0
  10. nextrec/basic/metrics.py +557 -0
  11. nextrec/basic/model.py +1438 -0
  12. nextrec/data/__init__.py +27 -0
  13. nextrec/data/data_utils.py +132 -0
  14. nextrec/data/preprocessor.py +662 -0
  15. nextrec/loss/__init__.py +35 -0
  16. nextrec/loss/loss_utils.py +136 -0
  17. nextrec/loss/match_losses.py +294 -0
  18. nextrec/models/generative/hstu.py +0 -0
  19. nextrec/models/generative/tiger.py +0 -0
  20. nextrec/models/match/__init__.py +13 -0
  21. nextrec/models/match/dssm.py +200 -0
  22. nextrec/models/match/dssm_v2.py +162 -0
  23. nextrec/models/match/mind.py +210 -0
  24. nextrec/models/match/sdm.py +253 -0
  25. nextrec/models/match/youtube_dnn.py +172 -0
  26. nextrec/models/multi_task/esmm.py +129 -0
  27. nextrec/models/multi_task/mmoe.py +161 -0
  28. nextrec/models/multi_task/ple.py +260 -0
  29. nextrec/models/multi_task/share_bottom.py +126 -0
  30. nextrec/models/ranking/__init__.py +17 -0
  31. nextrec/models/ranking/afm.py +118 -0
  32. nextrec/models/ranking/autoint.py +140 -0
  33. nextrec/models/ranking/dcn.py +120 -0
  34. nextrec/models/ranking/deepfm.py +95 -0
  35. nextrec/models/ranking/dien.py +214 -0
  36. nextrec/models/ranking/din.py +181 -0
  37. nextrec/models/ranking/fibinet.py +130 -0
  38. nextrec/models/ranking/fm.py +87 -0
  39. nextrec/models/ranking/masknet.py +125 -0
  40. nextrec/models/ranking/pnn.py +128 -0
  41. nextrec/models/ranking/widedeep.py +105 -0
  42. nextrec/models/ranking/xdeepfm.py +117 -0
  43. nextrec/utils/__init__.py +18 -0
  44. nextrec/utils/common.py +14 -0
  45. nextrec/utils/embedding.py +19 -0
  46. nextrec/utils/initializer.py +47 -0
  47. nextrec/utils/optimizer.py +75 -0
  48. nextrec-0.1.1.dist-info/METADATA +302 -0
  49. nextrec-0.1.1.dist-info/RECORD +51 -0
  50. nextrec-0.1.1.dist-info/WHEEL +4 -0
  51. nextrec-0.1.1.dist-info/licenses/LICENSE +21 -0
nextrec/models/multi_task/ple.py
@@ -0,0 +1,260 @@
+ """
+ Date: create on 09/11/2025
+ Author:
+     Yang Zhou,zyaztec@gmail.com
+ Reference:
+ [1] Tang H, Liu J, Zhao M, et al. Progressive layered extraction (ple): A novel multi-task learning (mtl) model for personalized recommendations[C]//RecSys. 2020: 269-278.
+ """
+
+ import torch
+ import torch.nn as nn
+
+ from nextrec.basic.model import BaseModel
+ from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+ class PLE(BaseModel):
+     """
+     Progressive Layered Extraction
+
+     PLE is an advanced multi-task learning model that extends MMOE by introducing
+     both task-specific experts and shared experts at each level. It uses a progressive
+     routing mechanism where experts from level k feed into gates at level k+1.
+     This design better captures task-specific and shared information progressively.
+     """
+
+     @property
+     def model_name(self):
+         return "PLE"
+
+     @property
+     def task_type(self):
+         return self.task if isinstance(self.task, list) else [self.task]
+
+     def __init__(self,
+                  dense_features: list[DenseFeature],
+                  sparse_features: list[SparseFeature],
+                  sequence_features: list[SequenceFeature],
+                  shared_expert_params: dict,
+                  specific_expert_params: dict,
+                  num_shared_experts: int,
+                  num_specific_experts: int,
+                  num_levels: int,
+                  tower_params_list: list[dict],
+                  target: list[str],
+                  task: str | list[str] = 'binary',
+                  optimizer: str = "adam",
+                  optimizer_params: dict = {},
+                  loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+                  device: str = 'cpu',
+                  model_id: str = "baseline",
+                  embedding_l1_reg=1e-6,
+                  dense_l1_reg=1e-5,
+                  embedding_l2_reg=1e-5,
+                  dense_l2_reg=1e-4):
+
+         super(PLE, self).__init__(
+             dense_features=dense_features,
+             sparse_features=sparse_features,
+             sequence_features=sequence_features,
+             target=target,
+             task=task,
+             device=device,
+             embedding_l1_reg=embedding_l1_reg,
+             dense_l1_reg=dense_l1_reg,
+             embedding_l2_reg=embedding_l2_reg,
+             dense_l2_reg=dense_l2_reg,
+             early_stop_patience=20,
+             model_id=model_id
+         )
+
+         self.loss = loss
+         if self.loss is None:
+             self.loss = "bce"
+
+         # Number of tasks, experts, and levels
+         self.num_tasks = len(target)
+         self.num_shared_experts = num_shared_experts
+         self.num_specific_experts = num_specific_experts
+         self.num_levels = num_levels
+         if optimizer_params is None:
+             optimizer_params = {}
+
+         if len(tower_params_list) != self.num_tasks:
+             raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
+         # All features
+         self.all_features = dense_features + sparse_features + sequence_features
+
+         # Embedding layer
+         self.embedding = EmbeddingLayer(features=self.all_features)
+
+         # Calculate input dimension
+         emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+         dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
+         input_dim = emb_dim_total + dense_input_dim
+
+         # Get expert output dimension
+         if 'dims' in shared_expert_params and len(shared_expert_params['dims']) > 0:
+             expert_output_dim = shared_expert_params['dims'][-1]
+         else:
+             expert_output_dim = input_dim
+
+         # Build extraction layers (CGC layers)
+         self.shared_experts_layers = nn.ModuleList()    # [num_levels]
+         self.specific_experts_layers = nn.ModuleList()  # [num_levels, num_tasks]
+         self.gates_layers = nn.ModuleList()             # [num_levels, num_tasks + 1] (+1 for shared gate)
+
+         for level in range(num_levels):
+             # Input dimension for this level
+             level_input_dim = input_dim if level == 0 else expert_output_dim
+
+             # Shared experts for this level
+             shared_experts = nn.ModuleList()
+             for _ in range(num_shared_experts):
+                 expert = MLP(input_dim=level_input_dim, output_layer=False, **shared_expert_params)
+                 shared_experts.append(expert)
+             self.shared_experts_layers.append(shared_experts)
+
+             # Task-specific experts for this level
+             specific_experts_for_tasks = nn.ModuleList()
+             for _ in range(self.num_tasks):
+                 task_experts = nn.ModuleList()
+                 for _ in range(num_specific_experts):
+                     expert = MLP(input_dim=level_input_dim, output_layer=False, **specific_expert_params)
+                     task_experts.append(expert)
+                 specific_experts_for_tasks.append(task_experts)
+             self.specific_experts_layers.append(specific_experts_for_tasks)
+
+             # Gates for this level (num_tasks task gates + 1 shared gate)
+             gates = nn.ModuleList()
+             # Task-specific gates
+             num_experts_for_task_gate = num_shared_experts + num_specific_experts
+             for _ in range(self.num_tasks):
+                 gate = nn.Sequential(
+                     nn.Linear(level_input_dim, num_experts_for_task_gate),
+                     nn.Softmax(dim=1)
+                 )
+                 gates.append(gate)
+             # Shared gate: contains all tasks' specific experts + shared experts
+             # expert count = num_shared_experts + num_specific_experts * num_tasks
+             num_experts_for_shared_gate = num_shared_experts + num_specific_experts * self.num_tasks
+             shared_gate = nn.Sequential(
+                 nn.Linear(level_input_dim, num_experts_for_shared_gate),
+                 nn.Softmax(dim=1)
+             )
+             gates.append(shared_gate)
+             self.gates_layers.append(gates)
+
+         # Task-specific towers
+         self.towers = nn.ModuleList()
+         for tower_params in tower_params_list:
+             tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
+             self.towers.append(tower)
+         self.prediction_layer = PredictionLayer(
+             task_type=self.task_type,
+             task_dims=[1] * self.num_tasks
+         )
+
+         # Register regularization weights
+         self._register_regularization_weights(
+             embedding_attr='embedding',
+             include_modules=['shared_experts_layers', 'specific_experts_layers', 'gates_layers', 'towers']
+         )
+
+         self.compile(
+             optimizer=optimizer,
+             optimizer_params=optimizer_params,
+             loss=loss
+         )
+
+     def forward(self, x):
+         # Get all embeddings and flatten
+         input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
+
+         # Initial features for each task and for the shared branch
+         task_fea = [input_flat for _ in range(self.num_tasks)]
+         shared_fea = input_flat
+
+         # Progressive Layered Extraction: stacked CGC levels
+         for level in range(self.num_levels):
+             shared_experts = self.shared_experts_layers[level]      # ModuleList[num_shared_experts]
+             specific_experts = self.specific_experts_layers[level]  # ModuleList[num_tasks][num_specific_experts]
+             gates = self.gates_layers[level]                        # ModuleList[num_tasks + 1]
+
+             # Compute shared expert outputs for this level
+             # shared_expert_list: List[Tensor[B, expert_dim]]
+             shared_expert_list = [expert(shared_fea) for expert in shared_experts]  # type: ignore[list-item]
+             # [num_shared_experts, B, expert_dim]
+             shared_expert_outputs = torch.stack(shared_expert_list, dim=0)
+
+             all_specific_outputs_for_shared = []
+
+             # Compute each task's specific expert outputs and gated feature
+             new_task_fea = []
+             for task_idx in range(self.num_tasks):
+                 # Current input for this task at this level
+                 current_task_in = task_fea[task_idx]
+
+                 # Task-specific experts for this task
+                 task_expert_modules = specific_experts[task_idx]  # type: ignore
+
+                 # Task-specific expert outputs: List[Tensor[B, expert_dim]]
+                 task_specific_list = []
+                 for expert in task_expert_modules:
+                     out = expert(current_task_in)
+                     task_specific_list.append(out)
+                     # Every task-specific expert is also a candidate for the shared gate
+                     all_specific_outputs_for_shared.append(out)
+
+                 # [num_specific_experts, B, expert_dim]
+                 task_specific_outputs = torch.stack(task_specific_list, dim=0)
+
+                 # Candidates for this task's gate: shared experts + its own specific experts
+                 # [num_shared + num_specific, B, expert_dim]
+                 all_expert_outputs = torch.cat(
+                     [shared_expert_outputs, task_specific_outputs],
+                     dim=0
+                 )
+                 # [B, num_experts, expert_dim]
+                 all_expert_outputs_t = all_expert_outputs.permute(1, 0, 2)
+
+                 # Gate for this task (gates[task_idx])
+                 # Output shape: [B, num_shared + num_specific]
+                 gate_weights = gates[task_idx](current_task_in)
+                 # [B, num_experts, 1]
+                 gate_weights = gate_weights.unsqueeze(2)
+
+                 # Weighted sum gives this task's features at this level: [B, expert_dim]
+                 gated_output = torch.sum(gate_weights * all_expert_outputs_t, dim=1)
+                 new_task_fea.append(gated_output)
+
+             # Compute the shared gate output
+             # Candidates for the shared gate: all tasks' specific experts + shared experts
+             # all_specific_outputs_for_shared: List[Tensor[B, expert_dim]]
+             # shared_expert_list: List[Tensor[B, expert_dim]]
+             all_for_shared_list = all_specific_outputs_for_shared + shared_expert_list
+             # [B, num_all_experts, expert_dim]
+             all_for_shared = torch.stack(all_for_shared_list, dim=1)
+
+             # [B, num_all_experts]
+             shared_gate_weights = gates[self.num_tasks](shared_fea)  # type: ignore
+             # [B, 1, num_all_experts]
+             shared_gate_weights = shared_gate_weights.unsqueeze(1)
+
+             # Weighted sum: [B, 1, expert_dim] → [B, expert_dim]
+             new_shared_fea = torch.bmm(shared_gate_weights, all_for_shared).squeeze(1)
+
+             task_fea = new_task_fea
+             shared_fea = new_shared_fea
+
+         # Task-specific towers
+         task_outputs = []
+         for task_idx in range(self.num_tasks):
+             tower_output = self.towers[task_idx](task_fea[task_idx])  # [B, 1]
+             task_outputs.append(tower_output)
+
+         # [B, num_tasks]
+         y = torch.cat(task_outputs, dim=1)
+         return self.prediction_layer(y)
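
The class docstring above describes the progressive (CGC) routing that forward() implements level by level. The following standalone sketch reproduces a single CGC gating step in plain PyTorch so the expert and gate shapes are easy to follow. It is illustrative only: nn.Linear stands in for nextrec's MLP experts, and all names (B, input_dim, task_gates, ...) are hypothetical rather than part of the package.

# Minimal sketch of one CGC gating step (illustrative; not part of nextrec).
import torch
import torch.nn as nn

B, input_dim, expert_dim = 4, 16, 8
num_shared, num_specific, num_tasks = 2, 2, 2

# nn.Linear stands in for nextrec's MLP experts.
shared_experts = nn.ModuleList([nn.Linear(input_dim, expert_dim) for _ in range(num_shared)])
task_experts = nn.ModuleList([
    nn.ModuleList([nn.Linear(input_dim, expert_dim) for _ in range(num_specific)])
    for _ in range(num_tasks)
])
# One softmax gate per task over its candidate experts (shared + own specific).
task_gates = nn.ModuleList([
    nn.Sequential(nn.Linear(input_dim, num_shared + num_specific), nn.Softmax(dim=1))
    for _ in range(num_tasks)
])

x = torch.randn(B, input_dim)                                    # level-0 input shared by all branches
shared_out = torch.stack([e(x) for e in shared_experts], dim=1)  # [B, num_shared, expert_dim]

task_features = []
for t in range(num_tasks):
    specific_out = torch.stack([e(x) for e in task_experts[t]], dim=1)  # [B, num_specific, expert_dim]
    candidates = torch.cat([shared_out, specific_out], dim=1)           # [B, num_shared + num_specific, expert_dim]
    weights = task_gates[t](x).unsqueeze(2)                             # [B, num_candidates, 1]
    task_features.append((weights * candidates).sum(dim=1))             # [B, expert_dim] per task

print([tuple(f.shape) for f in task_features])  # [(4, 8), (4, 8)]

In the full model this step is repeated for num_levels, with each task's gated output feeding that task's gate at the next level, and a separate shared gate mixing every expert to produce the next level's shared input.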
nextrec/models/multi_task/share_bottom.py
@@ -0,0 +1,126 @@
+ """
+ Date: create on 09/11/2025
+ Author:
+     Yang Zhou,zyaztec@gmail.com
+ Reference:
+ [1] Caruana R. Multitask learning[J]. Machine learning, 1997, 28: 41-75.
+ """
+
+ import torch
+ import torch.nn as nn
+
+ from nextrec.basic.model import BaseModel
+ from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+ class ShareBottom(BaseModel):
+     @property
+     def model_name(self):
+         return "ShareBottom"
+
+     @property
+     def task_type(self):
+         # Multi-task model, return list of task types
+         return self.task if isinstance(self.task, list) else [self.task]
+
+     def __init__(self,
+                  dense_features: list[DenseFeature],
+                  sparse_features: list[SparseFeature],
+                  sequence_features: list[SequenceFeature],
+                  bottom_params: dict,
+                  tower_params_list: list[dict],
+                  target: list[str],
+                  task: str | list[str] = 'binary',
+                  optimizer: str = "adam",
+                  optimizer_params: dict = {},
+                  loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+                  device: str = 'cpu',
+                  model_id: str = "baseline",
+                  embedding_l1_reg=1e-6,
+                  dense_l1_reg=1e-5,
+                  embedding_l2_reg=1e-5,
+                  dense_l2_reg=1e-4):
+
+         super(ShareBottom, self).__init__(
+             dense_features=dense_features,
+             sparse_features=sparse_features,
+             sequence_features=sequence_features,
+             target=target,
+             task=task,
+             device=device,
+             embedding_l1_reg=embedding_l1_reg,
+             dense_l1_reg=dense_l1_reg,
+             embedding_l2_reg=embedding_l2_reg,
+             dense_l2_reg=dense_l2_reg,
+             early_stop_patience=20,
+             model_id=model_id
+         )
+
+         self.loss = loss
+         if self.loss is None:
+             self.loss = "bce"
+
+         # Number of tasks
+         self.num_tasks = len(target)
+         if len(tower_params_list) != self.num_tasks:
+             raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
+         # All features
+         self.all_features = dense_features + sparse_features + sequence_features
+
+         # Embedding layer
+         self.embedding = EmbeddingLayer(features=self.all_features)
+
+         # Calculate input dimension
+         emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+         dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
+         input_dim = emb_dim_total + dense_input_dim
+
+         # Shared bottom network
+         self.bottom = MLP(input_dim=input_dim, output_layer=False, **bottom_params)
+
+         # Get bottom output dimension
+         if 'dims' in bottom_params and len(bottom_params['dims']) > 0:
+             bottom_output_dim = bottom_params['dims'][-1]
+         else:
+             bottom_output_dim = input_dim
+
+         # Task-specific towers
+         self.towers = nn.ModuleList()
+         for tower_params in tower_params_list:
+             tower = MLP(input_dim=bottom_output_dim, output_layer=True, **tower_params)
+             self.towers.append(tower)
+         self.prediction_layer = PredictionLayer(
+             task_type=self.task_type,
+             task_dims=[1] * self.num_tasks
+         )
+
+         # Register regularization weights
+         self._register_regularization_weights(
+             embedding_attr='embedding',
+             include_modules=['bottom', 'towers']
+         )
+
+         self.compile(
+             optimizer=optimizer,
+             optimizer_params=optimizer_params,
+             loss=loss
+         )
+
+     def forward(self, x):
+         # Get all embeddings and flatten
+         input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
+
+         # Shared bottom
+         bottom_output = self.bottom(input_flat)  # [B, bottom_dim]
+
+         # Task-specific towers
+         task_outputs = []
+         for tower in self.towers:
+             tower_output = tower(bottom_output)  # [B, 1]
+             task_outputs.append(tower_output)
+
+         # Stack outputs: [B, num_tasks]
+         y = torch.cat(task_outputs, dim=1)
+         return self.prediction_layer(y)
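
forward() ends by concatenating one [B, 1] tower output per target into a [B, num_tasks] tensor before the prediction layer. The snippet below is an independent illustration of how such a per-task output can be paired with per-task binary labels; it is not nextrec's loss code (that lives in BaseModel/compile, which is not part of this diff), and the tensor names and equal task weighting are assumptions.

# Illustration only: per-task BCE over a [B, num_tasks] pre-activation tensor, equal task weights.
import torch
import torch.nn.functional as F

B, num_tasks = 4, 2
logits = torch.randn(B, num_tasks)                    # stand-in for the concatenated tower outputs y
labels = torch.randint(0, 2, (B, num_tasks)).float()  # one binary target per task

per_task = [F.binary_cross_entropy_with_logits(logits[:, t], labels[:, t]) for t in range(num_tasks)]
total_loss = torch.stack(per_task).sum()
print(total_loss.item())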
nextrec/models/ranking/__init__.py
@@ -0,0 +1,17 @@
+ from .deepfm import DeepFM
+ from .autoint import AutoInt
+ from .widedeep import WideDeep
+ from .xdeepfm import xDeepFM
+ from .dcn import DCN
+ from .din import DIN
+ from .dien import DIEN
+
+ __all__ = [
+     'DeepFM',
+     'AutoInt',
+     'WideDeep',
+     'xDeepFM',
+     'DCN',
+     'DIN',
+     'DIEN',
+ ]
nextrec/models/ranking/afm.py
@@ -0,0 +1,118 @@
+ """
+ Date: create on 09/11/2025
+ Author:
+     Yang Zhou,zyaztec@gmail.com
+ Reference:
+ [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
+     feature interactions via attention networks[C]//IJCAI. 2017: 3119-3125.
+ """
+
+ import torch
+ import torch.nn as nn
+
+ from nextrec.basic.model import BaseModel
+ from nextrec.basic.layers import EmbeddingLayer, LR, PredictionLayer
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+ class AFM(BaseModel):
+     @property
+     def model_name(self):
+         return "AFM"
+
+     @property
+     def task_type(self):
+         return "binary"
+
+     def __init__(self,
+                  dense_features: list[DenseFeature] | list = [],
+                  sparse_features: list[SparseFeature] | list = [],
+                  sequence_features: list[SequenceFeature] | list = [],
+                  attention_dim: int = 32,
+                  attention_dropout: float = 0.0,
+                  target: list[str] | list = [],
+                  optimizer: str = "adam",
+                  optimizer_params: dict = {},
+                  loss: str | nn.Module | None = "bce",
+                  device: str = 'cpu',
+                  model_id: str = "baseline",
+                  embedding_l1_reg=1e-6,
+                  dense_l1_reg=1e-5,
+                  embedding_l2_reg=1e-5,
+                  dense_l2_reg=1e-4):
+
+         super(AFM, self).__init__(
+             dense_features=dense_features,
+             sparse_features=sparse_features,
+             sequence_features=sequence_features,
+             target=target,
+             task=self.task_type,
+             device=device,
+             embedding_l1_reg=embedding_l1_reg,
+             dense_l1_reg=dense_l1_reg,
+             embedding_l2_reg=embedding_l2_reg,
+             dense_l2_reg=dense_l2_reg,
+             early_stop_patience=20,
+             model_id=model_id
+         )
+
+         self.loss = loss
+         if self.loss is None:
+             self.loss = "bce"
+
+         self.fm_features = sparse_features + sequence_features
+         if len(self.fm_features) < 2:
+             raise ValueError("AFM requires at least two sparse/sequence features to build pairwise interactions.")
+
+         # Assume uniform embedding dimension across FM fields
+         self.embedding_dim = self.fm_features[0].embedding_dim
+         if any(f.embedding_dim != self.embedding_dim for f in self.fm_features):
+             raise ValueError("All FM features must share the same embedding_dim for AFM.")
+
+         self.embedding = EmbeddingLayer(features=self.fm_features)
+
+         fm_input_dim = sum([f.embedding_dim for f in self.fm_features])
+         self.linear = LR(fm_input_dim)
+
+         self.attention_linear = nn.Linear(self.embedding_dim, attention_dim)
+         self.attention_p = nn.Linear(attention_dim, 1, bias=False)
+         self.attention_dropout = nn.Dropout(attention_dropout)
+         self.output_projection = nn.Linear(self.embedding_dim, 1, bias=False)
+         self.prediction_layer = PredictionLayer(task_type=self.task_type)
+
+         # Register regularization weights
+         self._register_regularization_weights(
+             embedding_attr='embedding',
+             include_modules=['linear', 'attention_linear', 'attention_p', 'output_projection']
+         )
+
+         self.compile(
+             optimizer=optimizer,
+             optimizer_params=optimizer_params,
+             loss=loss
+         )
+
+     def forward(self, x):
+         field_emb = self.embedding(x=x, features=self.fm_features, squeeze_dim=False)  # [B, F, D]
+         input_linear = field_emb.flatten(start_dim=1)
+         y_linear = self.linear(input_linear)
+
+         interactions = []
+         num_fields = field_emb.shape[1]
+         for i in range(num_fields - 1):
+             vi = field_emb[:, i, :]
+             for j in range(i + 1, num_fields):
+                 vj = field_emb[:, j, :]
+                 interactions.append(vi * vj)
+
+         pair_tensor = torch.stack(interactions, dim=1)  # [B, num_pairs, D]
+         attention_scores = torch.tanh(self.attention_linear(pair_tensor))
+         attention_scores = self.attention_p(attention_scores)  # [B, num_pairs, 1]
+         attention_weights = torch.softmax(attention_scores, dim=1)
+
+         weighted_sum = torch.sum(attention_weights * pair_tensor, dim=1)
+         weighted_sum = self.attention_dropout(weighted_sum)
+         y_afm = self.output_projection(weighted_sum)
+
+         y = y_linear + y_afm
+         return self.prediction_layer(y)
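
AFM.forward builds the pairwise element-wise products with a nested Python loop over fields. The sketch below, which is not part of the package, shows an equivalent vectorized form using torch.triu_indices; the pair ordering matches the loop, so the resulting [B, num_pairs, D] tensor is interchangeable with pair_tensor above.

# Illustration: vectorized pairwise interactions equivalent to the double loop above.
import torch

B, num_fields, D = 4, 5, 8
field_emb = torch.randn(B, num_fields, D)

# Loop form, as in AFM.forward
pairs_loop = []
for i in range(num_fields - 1):
    for j in range(i + 1, num_fields):
        pairs_loop.append(field_emb[:, i, :] * field_emb[:, j, :])
pairs_loop = torch.stack(pairs_loop, dim=1)  # [B, num_pairs, D]

# Vectorized form: all (i, j) index pairs with i < j, in the same order
rows, cols = torch.triu_indices(num_fields, num_fields, offset=1)
pairs_vec = field_emb[:, rows, :] * field_emb[:, cols, :]

assert torch.allclose(pairs_loop, pairs_vec)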
nextrec/models/ranking/autoint.py
@@ -0,0 +1,140 @@
+ """
+ Date: create on 09/11/2025
+ Author:
+     Yang Zhou,zyaztec@gmail.com
+ Reference:
+ [1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
+     self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
+     on information and knowledge management. 2019: 1161-1170.
+     (https://arxiv.org/abs/1810.11921)
+ """
+
+ import torch
+ import torch.nn as nn
+
+ from nextrec.basic.model import BaseModel
+ from nextrec.basic.layers import EmbeddingLayer, MultiHeadSelfAttention, PredictionLayer
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+
+ class AutoInt(BaseModel):
+     @property
+     def model_name(self):
+         return "AutoInt"
+
+     @property
+     def task_type(self):
+         return "binary"
+
+     def __init__(self,
+                  dense_features: list[DenseFeature],
+                  sparse_features: list[SparseFeature],
+                  sequence_features: list[SequenceFeature],
+                  att_layer_num: int = 3,
+                  att_embedding_dim: int = 8,
+                  att_head_num: int = 2,
+                  att_dropout: float = 0.0,
+                  att_use_residual: bool = True,
+                  target: list[str] = [],
+                  optimizer: str = "adam",
+                  optimizer_params: dict = {},
+                  loss: str | nn.Module | None = "bce",
+                  device: str = 'cpu',
+                  model_id: str = "baseline",
+                  embedding_l1_reg=1e-6,
+                  dense_l1_reg=1e-5,
+                  embedding_l2_reg=1e-5,
+                  dense_l2_reg=1e-4):
+
+         super(AutoInt, self).__init__(
+             dense_features=dense_features,
+             sparse_features=sparse_features,
+             sequence_features=sequence_features,
+             target=target,
+             task=self.task_type,
+             device=device,
+             embedding_l1_reg=embedding_l1_reg,
+             dense_l1_reg=dense_l1_reg,
+             embedding_l2_reg=embedding_l2_reg,
+             dense_l2_reg=dense_l2_reg,
+             early_stop_patience=20,
+             model_id=model_id
+         )
+
+         self.loss = loss
+         if self.loss is None:
+             self.loss = "bce"
+
+         self.att_layer_num = att_layer_num
+         self.att_embedding_dim = att_embedding_dim
+
+         # Use sparse and sequence features for interaction
+         self.interaction_features = sparse_features + sequence_features
+
+         # All features for embedding
+         self.all_features = dense_features + sparse_features + sequence_features
+
+         # Embedding layer
+         self.embedding = EmbeddingLayer(features=self.all_features)
+
+         # Project embeddings to attention embedding dimension
+         num_fields = len(self.interaction_features)
+         total_embedding_dim = sum([f.embedding_dim for f in self.interaction_features])
+
+         # If embeddings have different dimensions, project them to att_embedding_dim
+         self.need_projection = not all(f.embedding_dim == att_embedding_dim for f in self.interaction_features)
+         self.projection_layers = None
+         if self.need_projection:
+             self.projection_layers = nn.ModuleList([
+                 nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
+                 for f in self.interaction_features
+             ])
+
+         # Multi-head self-attention layers
+         self.attention_layers = nn.ModuleList([
+             MultiHeadSelfAttention(
+                 embedding_dim=att_embedding_dim,
+                 num_heads=att_head_num,
+                 dropout=att_dropout,
+                 use_residual=att_use_residual
+             ) for _ in range(att_layer_num)
+         ])
+
+         # Final prediction layer
+         self.fc = nn.Linear(num_fields * att_embedding_dim, 1)
+         self.prediction_layer = PredictionLayer(task_type=self.task_type)
+
+         # Register regularization weights
+         self._register_regularization_weights(
+             embedding_attr='embedding',
+             include_modules=['projection_layers', 'attention_layers', 'fc']
+         )
+
+         self.compile(
+             optimizer=optimizer,
+             optimizer_params=optimizer_params,
+             loss=loss
+         )
+
+     def forward(self, x):
+         # Get embeddings field-by-field so mixed dimensions can be projected safely
+         field_embeddings = []
+         if len(self.interaction_features) == 0:
+             raise ValueError("AutoInt requires at least one sparse or sequence feature for interactions.")
+         for idx, feature in enumerate(self.interaction_features):
+             feature_emb = self.embedding(x=x, features=[feature], squeeze_dim=False)
+             feature_emb = feature_emb.squeeze(1)  # [B, embedding_dim]
+             if self.need_projection and self.projection_layers is not None:
+                 feature_emb = self.projection_layers[idx](feature_emb)
+             field_embeddings.append(feature_emb.unsqueeze(1))  # [B, 1, att_embedding_dim or original_dim]
+         embeddings = torch.cat(field_embeddings, dim=1)
+
+         # Apply multi-head self-attention layers
+         attention_output = embeddings
+         for att_layer in self.attention_layers:
+             attention_output = att_layer(attention_output)  # [B, num_fields, att_embedding_dim]
+
+         # Flatten and predict
+         attention_output_flat = attention_output.flatten(start_dim=1)  # [B, num_fields * att_embedding_dim]
+         y = self.fc(attention_output_flat)  # [B, 1]
+         return self.prediction_layer(y)
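
The interacting layers above come from nextrec.basic.layers.MultiHeadSelfAttention, whose implementation is not included in this diff. As a rough, generic stand-in, the sketch below runs field embeddings through torch.nn.MultiheadAttention with a residual connection to show the [B, num_fields, att_embedding_dim] shape flow into the final linear layer; it is not the package's layer, and the ReLU-plus-residual step is only one common choice (cf. att_use_residual).

# Generic stand-in (not nextrec's layer): shape flow of AutoInt-style field attention.
import torch
import torch.nn as nn

B, num_fields, att_dim, heads = 4, 6, 8, 2
fields = torch.randn(B, num_fields, att_dim)  # projected field embeddings

mha = nn.MultiheadAttention(embed_dim=att_dim, num_heads=heads, batch_first=True)
attn_out, _ = mha(fields, fields, fields)     # self-attention across fields
out = torch.relu(attn_out + fields)           # residual connection over the field axis

fc = nn.Linear(num_fields * att_dim, 1)
logit = fc(out.flatten(start_dim=1))          # [B, 1], analogous to the input of the prediction layer
print(logit.shape)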