nextrec 0.2.6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/activation.py +4 -8
  3. nextrec/basic/callback.py +1 -1
  4. nextrec/basic/features.py +33 -25
  5. nextrec/basic/layers.py +164 -601
  6. nextrec/basic/loggers.py +3 -4
  7. nextrec/basic/metrics.py +39 -115
  8. nextrec/basic/model.py +248 -174
  9. nextrec/basic/session.py +1 -5
  10. nextrec/data/__init__.py +12 -0
  11. nextrec/data/data_utils.py +3 -27
  12. nextrec/data/dataloader.py +26 -34
  13. nextrec/data/preprocessor.py +2 -1
  14. nextrec/loss/listwise.py +6 -4
  15. nextrec/loss/loss_utils.py +10 -6
  16. nextrec/loss/pairwise.py +5 -3
  17. nextrec/loss/pointwise.py +7 -13
  18. nextrec/models/match/mind.py +110 -1
  19. nextrec/models/multi_task/esmm.py +46 -27
  20. nextrec/models/multi_task/mmoe.py +48 -30
  21. nextrec/models/multi_task/ple.py +156 -141
  22. nextrec/models/multi_task/poso.py +413 -0
  23. nextrec/models/multi_task/share_bottom.py +43 -26
  24. nextrec/models/ranking/__init__.py +2 -0
  25. nextrec/models/ranking/autoint.py +1 -1
  26. nextrec/models/ranking/dcn.py +20 -1
  27. nextrec/models/ranking/dcn_v2.py +84 -0
  28. nextrec/models/ranking/deepfm.py +44 -18
  29. nextrec/models/ranking/dien.py +130 -27
  30. nextrec/models/ranking/masknet.py +13 -67
  31. nextrec/models/ranking/widedeep.py +39 -18
  32. nextrec/models/ranking/xdeepfm.py +34 -1
  33. nextrec/utils/common.py +26 -1
  34. nextrec-0.3.1.dist-info/METADATA +306 -0
  35. nextrec-0.3.1.dist-info/RECORD +56 -0
  36. {nextrec-0.2.6.dist-info → nextrec-0.3.1.dist-info}/WHEEL +1 -1
  37. nextrec-0.2.6.dist-info/METADATA +0 -281
  38. nextrec-0.2.6.dist-info/RECORD +0 -54
  39. {nextrec-0.2.6.dist-info → nextrec-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,413 @@
1
+ """
2
+ Date: create on 28/11/2025
3
+ Author: Yang Zhou,zyaztec@gmail.com
4
+ Reference:
5
+ [1] Wang et al. "POSO: Personalized Cold Start Modules for Large-scale Recommender Systems", 2021.
6
+
7
+ POSO (Personalized cOld-start mOdules) augments backbone recommenders by injecting a
8
+ personalized cold-start vector `pc` that gates hidden units layer by layer. Each fully
9
+ connected layer or expert output is multiplied by gate(pc), letting the backbone adapt
10
+ its hidden representations to user profiles even when behavioral signals are scarce.
11
+
12
+ Core idea:
13
+ (1) A lightweight two-layer MLP maps `pc` to gate(pc) = C * sigmoid(W2 * phi(W1 * pc + b1) + b2)
14
+ (2) gate(pc) scales each hidden unit element-wise, masking or amplifying features
15
+ (3) Existing task gates/towers remain intact; POSO only overlays personalization
16
+
17
+ Key advantages:
18
+ - Plug-and-play personalization for cold-start users without redesigning the backbone
19
+ - Per-layer/expert gating with minimal additional parameters
20
+ - Compatible with plain MLP towers and MMoE structures, keeping training stable
21
+ - Works with split features: main features feed the backbone, PC features drive gates
22
+
23
+ POSO 通过个性化冷启动向量 `pc` 为推荐模型叠加逐层的门控系数,
24
+ 在每个全连接层或专家输出上乘以 gate(pc) 做元素级缩放,
25
+ 即使行为信号稀缺也能按用户画像调整隐藏表示。
26
+
27
+ 实现思路:
28
+ (1) 用轻量两层 MLP 生成 gate(pc) = C * sigmoid(W2 * phi(W1 * pc + b1) + b2)
29
+ (2) gate(pc) 对神经元逐元素放大或抑制
30
+ (3) 原有任务门/塔不变,POSO 仅叠加个性化门控
31
+
32
+ 主要优点:
33
+ - 冷启动场景的可插拔个性化,无需重做骨干结构
34
+ - 每层/每专家独立门控,新增参数量小
35
+ - 兼容 MLP、MMoE 等多任务骨干,训练过程平稳
36
+ - 主特征做建模,PC 特征驱动门控,解耦表征与个性化信号
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import torch
42
+ import torch.nn as nn
43
+ import torch.nn.functional as F
44
+
45
+ from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
46
+ from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
47
+ from nextrec.basic.activation import activation_layer
48
+ from nextrec.basic.model import BaseModel
49
+ from nextrec.utils.common import merge_features
50
+
51
+
52
class POSOGate(nn.Module):
    """
    Maps the personalized cold-start vector `pc` to an element-wise gate:
        gate(pc) = C * sigmoid(W2 * phi(W1 * pc + b1) + b2)
    The gate has the same width as the hidden vector it scales and is
    applied element-wise.
    """

    def __init__(
        self,
        pc_dim: int,
        out_dim: int,
        hidden_dim: int = 32,
        scale_factor: float = 2.0,
        activation: str = "relu",
    ) -> None:
        super().__init__()
        # Lightweight two-layer bottleneck: pc_dim -> hidden_dim -> out_dim.
        self.fc1 = nn.Linear(pc_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.scale_factor = scale_factor
        self.act = activation_layer(activation)

    def forward(self, pc: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pc: (B, pc_dim) personalized cold-start vector.
        Returns:
            (B, out_dim) gate values in (0, C), where C = scale_factor.
        """
        hidden = self.act(self.fc1(pc))
        gate = torch.sigmoid(self.fc2(hidden))  # each entry in (0, 1)
        return self.scale_factor * gate
82
+
83
+
84
class POSOFC(nn.Module):
    """
    One POSO fully connected layer, mirroring Eq. (11):
        h     = phi(Wx + b)
        h_hat = gate(pc) ⊙ h
    with gate(pc) = C * sigmoid(MLP(pc)).
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        pc_dim: int,
        gate_hidden_dim: int = 32,
        scale_factor: float = 2.0,
        activation: str = "relu",
        use_bias: bool = True,
    ) -> None:
        super().__init__()
        # Standard FC transform of the backbone input.
        self.linear = nn.Linear(in_dim, out_dim, bias=use_bias)
        self.act = activation_layer(activation)
        # PC-driven gate with the same width as this layer's output.
        self.gate = POSOGate(
            pc_dim=pc_dim,
            out_dim=out_dim,
            hidden_dim=gate_hidden_dim,
            scale_factor=scale_factor,
            activation=activation,
        )

    def forward(self, x: torch.Tensor, pc: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: (B, in_dim) backbone input.
            pc: (B, pc_dim) personalization vector.
        Returns:
            (B, out_dim) activations scaled element-wise by gate(pc).
        """
        hidden = self.act(self.linear(x))  # standard FC with activation
        return self.gate(pc) * hidden      # element-wise PC gating
122
+
123
+
124
class POSOMLP(nn.Module):
    """
    POSO-enhanced MLP that stacks POSOFC layers, each with its own PC gate
    g_l(pc) following Eq. (11).

    dims: e.g. [256, 128, 64] yields input_dim -> 256 -> 128 -> 64.
    """

    def __init__(
        self,
        input_dim: int,
        pc_dim: int,
        dims: list[int],
        gate_hidden_dim: int = 32,
        scale_factor: float = 2.0,
        activation: str = "relu",
        use_bias: bool = True,
        dropout: float = 0.0,
    ) -> None:
        super().__init__()
        # Pair each layer's fan-in with its fan-out: [input_dim, d0, d1, ...].
        fan_ins = [input_dim] + list(dims[:-1])
        self.layers = nn.ModuleList(
            [
                POSOFC(
                    in_dim=fan_in,
                    out_dim=fan_out,
                    pc_dim=pc_dim,
                    gate_hidden_dim=gate_hidden_dim,
                    scale_factor=scale_factor,
                    activation=activation,
                    use_bias=use_bias,
                )
                for fan_in, fan_out in zip(fan_ins, dims)
            ]
        )
        self.dropout = nn.Dropout(dropout) if dropout > 0 else None

    def forward(self, x: torch.Tensor, pc: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: (B, input_dim) backbone input.
            pc: (B, pc_dim) personalization vector shared by all layers.
        Returns:
            (B, dims[-1]) gated hidden representation.
        """
        hidden = x
        for fc in self.layers:
            hidden = fc(hidden, pc)
            # Dropout (when enabled) is applied after every gated layer.
            if self.dropout is not None:
                hidden = self.dropout(hidden)
        return hidden
176
+
177
+
178
class POSOMMoE(nn.Module):
    """
    POSO(MMoE) following Section 4.4 and Eq. (15)-(18) of the paper:
    - The original experts and task gates gate_t(x) are kept intact
    - Every expert_e gains an additional PC gate g_e(pc)
    - Task gates aggregate the PC-masked expert outputs

    Concretely:
        h_e       = expert_e(x)               # (B, D)
        g_e       = POSOGate(pc) in (0, C)^D  # (B, D)
        h_e_tilde = g_e ⊙ h_e                 # (B, D)
        z_t       = Σ_e gate_t,e(x) * h_e_tilde
    """

    def __init__(
        self,
        input_dim: int,
        pc_dim: int,  # POSO feature dimension
        num_experts: int,
        expert_hidden_dims: list[int],
        num_tasks: int,
        activation: str = "relu",
        expert_dropout: float = 0.0,
        gate_hidden_dim: int = 32,  # POSO gate hidden dimension
        scale_factor: float = 2.0,  # POSO gate scale factor
        gate_use_softmax: bool = True,
    ) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.num_tasks = num_tasks

        # Experts built with the framework MLP, identical to standard MMoE.
        self.experts = nn.ModuleList(
            [
                MLP(
                    input_dim=input_dim,
                    output_layer=False,
                    dims=expert_hidden_dims,
                    activation=activation,
                    dropout=expert_dropout,
                )
                for _ in range(num_experts)
            ]
        )
        self.expert_output_dim = expert_hidden_dims[-1] if expert_hidden_dims else input_dim

        # Task-specific gates: gate_t(x) produces mixing weights over experts.
        self.gates = nn.ModuleList([nn.Linear(input_dim, num_experts) for _ in range(num_tasks)])
        self.gate_use_softmax = gate_use_softmax

        # One PC gate per expert: g_e(pc) ∈ R^D.
        self.expert_pc_gates = nn.ModuleList(
            [
                POSOGate(
                    pc_dim=pc_dim,
                    out_dim=self.expert_output_dim,
                    hidden_dim=gate_hidden_dim,
                    scale_factor=scale_factor,
                    activation=activation,
                )
                for _ in range(num_experts)
            ]
        )

    def forward(self, x: torch.Tensor, pc: torch.Tensor) -> list[torch.Tensor]:
        """
        Args:
            x: (B, input_dim) main-feature input.
            pc: (B, pc_dim) personalization vector.
        Returns:
            List of num_tasks tensors z_t, each of shape (B, D).
        """
        # 1) Expert outputs, each masked element-wise by its POSO PC gate.
        masked = [
            pc_gate(pc) * expert(x)  # (B, D)
            for expert, pc_gate in zip(self.experts, self.expert_pc_gates)
        ]
        masked = torch.stack(masked, dim=1)  # (B, E, D)

        # 2) Task gates depend on x, exactly as in standard MMoE.
        task_outputs: list[torch.Tensor] = []
        for task_gate in self.gates:
            logits = task_gate(x)  # (B, E)
            weights = F.softmax(logits, dim=1) if self.gate_use_softmax else logits
            weights = weights.unsqueeze(-1)  # (B, E, 1)
            task_outputs.append(torch.sum(weights * masked, dim=1))  # (B, D)

        return task_outputs
250
+
251
+
252
class POSO(BaseModel):
    """
    POSO model implemented with the NextRec framework. It supports two backbones:
    - "mlp": per-task POSO-MLP towers with PC gating on every hidden layer
    - "mmoe": POSO-gated MMoE experts plus task-specific towers
    """

    @property
    def model_name(self) -> str:
        # Human-readable model identifier used by the framework.
        return "POSO"

    @property
    def task_type(self) -> list[str]:
        # Normalize `self.task` to a list so downstream code can iterate tasks.
        return self.task if isinstance(self.task, list) else [self.task]

    def __init__(
        self,
        main_dense_features: list[DenseFeature] | None,
        main_sparse_features: list[SparseFeature] | None,
        main_sequence_features: list[SequenceFeature] | None,
        pc_dense_features: list[DenseFeature] | None,
        pc_sparse_features: list[SparseFeature] | None,
        pc_sequence_features: list[SequenceFeature] | None,
        tower_params_list: list[dict],
        target: list[str],
        task: str | list[str] = "binary",
        architecture: str = "mlp",
        # POSO gating defaults
        gate_hidden_dim: int = 32,
        gate_scale_factor: float = 2.0,
        gate_activation: str = "relu",
        gate_use_bias: bool = True,
        # MMoE-specific params
        num_experts: int = 4,
        expert_hidden_dims: list[int] | None = None,
        expert_activation: str = "relu",
        expert_dropout: float = 0.0,
        expert_gate_hidden_dim: int = 32,
        expert_gate_scale_factor: float = 2.0,
        gate_use_softmax: bool = True,
        optimizer: str = "adam",
        optimizer_params: dict | None = None,
        loss: str | nn.Module | list[str | nn.Module] | None = "bce",
        loss_params: dict | list[dict] | None = None,
        device: str = "cpu",
        embedding_l1_reg: float = 1e-6,
        dense_l1_reg: float = 1e-5,
        embedding_l2_reg: float = 1e-5,
        dense_l2_reg: float = 1e-4,
        **kwargs,
    ):
        """Build a POSO model.

        Args:
            main_*_features: features that feed the backbone (experts/towers).
            pc_*_features: features that drive the personalization gates;
                at least one PC feature is required.
            tower_params_list: one params dict per task; for the "mlp"
                architecture each must contain a non-empty "dims" list.
            target: list of target names; its length defines the task count.
            architecture: "mlp" (per-task POSO-MLP towers) or "mmoe"
                (POSO-gated MMoE experts + per-task towers).

        Raises:
            ValueError: when no PC feature is given, when the number of tower
                param dicts does not match the number of targets, when the
                architecture is unknown, or when required architecture-specific
                params are missing.
        """
        # Keep explicit copies of main and PC features
        # (plain-attribute assignment before super().__init__() is fine here:
        # nn.Module.__setattr__ only special-cases Parameters/Modules).
        self.main_dense_features = list(main_dense_features or [])
        self.main_sparse_features = list(main_sparse_features or [])
        self.main_sequence_features = list(main_sequence_features or [])
        self.pc_dense_features = list(pc_dense_features or [])
        self.pc_sparse_features = list(pc_sparse_features or [])
        self.pc_sequence_features = list(pc_sequence_features or [])

        # POSO is pointless without a personalization signal.
        if not self.pc_dense_features and not self.pc_sparse_features and not self.pc_sequence_features:
            raise ValueError("POSO requires at least one PC feature for personalization.")

        # Merged feature lists are handed to BaseModel so embeddings cover both groups.
        dense_features = merge_features(self.main_dense_features, self.pc_dense_features)
        sparse_features = merge_features(self.main_sparse_features, self.pc_sparse_features)
        sequence_features = merge_features(self.main_sequence_features, self.pc_sequence_features)

        super().__init__(
            dense_features=dense_features,
            sparse_features=sparse_features,
            sequence_features=sequence_features,
            target=target,
            task=task,
            device=device,
            embedding_l1_reg=embedding_l1_reg,
            dense_l1_reg=dense_l1_reg,
            embedding_l2_reg=embedding_l2_reg,
            dense_l2_reg=dense_l2_reg,
            early_stop_patience=20,
            **kwargs,
        )

        # NOTE(review): when `loss` is None, self.loss becomes "bce" but
        # compile() below still receives the original None — confirm compile()
        # applies the same default.
        self.loss = loss if loss is not None else "bce"
        optimizer_params = optimizer_params or {}

        # One tower (and one prediction head) per target.
        self.num_tasks = len(target)
        if len(tower_params_list) != self.num_tasks:
            raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")

        self.main_features = self.main_dense_features + self.main_sparse_features + self.main_sequence_features
        self.pc_features = self.pc_dense_features + self.pc_sparse_features + self.pc_sequence_features

        # NOTE(review): `self.all_features` is presumably populated by
        # BaseModel.__init__ from the merged feature lists — confirm.
        self.embedding = EmbeddingLayer(features=self.all_features)
        self.main_input_dim = self.embedding.get_input_dim(self.main_features)
        self.pc_input_dim = self.embedding.get_input_dim(self.pc_features)

        self.architecture = architecture.lower()
        if self.architecture not in {"mlp", "mmoe"}:
            raise ValueError(f"Unsupported architecture '{architecture}', choose from ['mlp', 'mmoe'].")

        # Build backbones
        if self.architecture == "mlp":
            # Per-task POSO-MLP towers, each followed by a linear logit head.
            self.towers = nn.ModuleList()
            self.tower_heads = nn.ModuleList()
            for tower_params in tower_params_list:
                dims = tower_params.get("dims")
                if not dims:
                    raise ValueError("tower_params must include a non-empty 'dims' list for POSO-MLP towers.")
                dropout = tower_params.get("dropout", 0.0)
                # Per-tower overrides fall back to the constructor-level defaults.
                tower = POSOMLP(
                    input_dim=self.main_input_dim,
                    pc_dim=self.pc_input_dim,
                    dims=dims,
                    gate_hidden_dim=tower_params.get("gate_hidden_dim", gate_hidden_dim),
                    scale_factor=tower_params.get("scale_factor", gate_scale_factor),
                    activation=tower_params.get("activation", gate_activation),
                    use_bias=tower_params.get("use_bias", gate_use_bias),
                    dropout=dropout,
                )
                self.towers.append(tower)
                tower_output_dim = dims[-1] if dims else self.main_input_dim
                self.tower_heads.append(nn.Linear(tower_output_dim, 1))
        else:
            # POSO-gated MMoE backbone; towers here include their own output layer.
            if expert_hidden_dims is None or not expert_hidden_dims:
                raise ValueError("expert_hidden_dims must be provided for MMoE architecture.")
            self.mmoe = POSOMMoE(
                input_dim=self.main_input_dim,
                pc_dim=self.pc_input_dim,
                num_experts=num_experts,
                expert_hidden_dims=expert_hidden_dims,
                num_tasks=self.num_tasks,
                activation=expert_activation,
                expert_dropout=expert_dropout,
                gate_hidden_dim=expert_gate_hidden_dim,
                scale_factor=expert_gate_scale_factor,
                gate_use_softmax=gate_use_softmax,
            )
            self.towers = nn.ModuleList([MLP(input_dim=self.mmoe.expert_output_dim, output_layer=True, **tower_params,) for tower_params in tower_params_list])
            # No separate heads: MMoE towers already emit one logit each.
            self.tower_heads = None
        self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks,)
        # Register only the modules that exist for the chosen architecture.
        include_modules = ["towers", "tower_heads"] if self.architecture == "mlp" else ["mmoe", "towers"]
        self._register_regularization_weights(embedding_attr="embedding", include_modules=include_modules)
        self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

    def forward(self, x):
        # Embed main and PC features separately so PC can gate hidden units
        main_input = self.embedding(x=x, features=self.main_features, squeeze_dim=True)
        pc_input = self.embedding(x=x, features=self.pc_features, squeeze_dim=True)

        task_outputs = []
        if self.architecture == "mlp":
            # Each tower consumes the main input and is gated by the PC input;
            # its linear head produces the per-task logit.
            for tower, head in zip(self.towers, self.tower_heads):
                hidden = tower(main_input, pc_input)
                logit = head(hidden)
                task_outputs.append(logit)
        else:
            # MMoE backbone returns one aggregated representation per task;
            # the matching tower maps it to a logit.
            expert_outputs = self.mmoe(main_input, pc_input)
            for idx, tower in enumerate(self.towers):
                logit = tower(expert_outputs[idx])
                task_outputs.append(logit)

        # (B, num_tasks) logits -> task-type-specific predictions.
        y = torch.cat(task_outputs, dim=1)
        return self.prediction_layer(y)
@@ -1,7 +1,42 @@
1
1
  """
2
2
  Date: create on 09/11/2025
3
+ Checkpoint: edit on 24/11/2025
3
4
  Author: Yang Zhou,zyaztec@gmail.com
4
- Reference: [1] Caruana R. Multitask learning[J]. Machine learning, 1997, 28: 41-75.
5
+ Reference:
6
+ [1] Caruana R. Multitask learning[J]. Machine Learning, 1997, 28: 41-75.
7
+ (https://link.springer.com/article/10.1023/A:1007379606734)
8
+
9
+ Shared-Bottom is the classic hard-parameter-sharing baseline for multi-task learning.
10
+ All tasks share a common bottom network to learn general representations, and each
11
+ task has its own tower head for task-specific refinement and prediction. This
12
+ architecture is simple, parameter-efficient, and helps regularize related tasks.
13
+
14
+ Workflow:
15
+ (1) Unified embeddings convert dense/sparse/sequence features
16
+ (2) A shared bottom MLP learns common representations
17
+ (3) Task-specific towers further transform the shared features
18
+ (4) Separate prediction heads output each task’s logits/probabilities
19
+
20
+ Key Advantages:
21
+ - Strong inductive bias via hard parameter sharing, reducing overfitting
22
+ - Parameter-efficient compared to duplicating full models per task
23
+ - Easy to extend to many tasks with small incremental cost
24
+ - Serves as a stable baseline for evaluating advanced MTL architectures
25
+
26
+ Share-Bottom(硬共享底层)是多任务学习的经典基线:所有任务共享一个底层网络,
27
+ 各任务拥有独立塔头进行细化与预测,简单高效且能通过共享正则化相关任务。
28
+
29
+ 流程:
30
+ (1) 统一 embedding 处理稠密、稀疏与序列特征
31
+ (2) 共享底层 MLP 学习通用表示
32
+ (3) 任务塔在共享表示上做任务特定变换
33
+ (4) 各任务预测头输出对应结果
34
+
35
+ 主要优点:
36
+ - 硬参数共享提供强正则,减少过拟合
37
+ - 相比单独模型更节省参数与计算
38
+ - 易于扩展到多任务,增量开销小
39
+ - 是评估更复杂 MTL 结构的稳健基线
5
40
  """
6
41
 
7
42
  import torch
@@ -59,22 +94,17 @@ class ShareBottom(BaseModel):
59
94
  self.loss = loss
60
95
  if self.loss is None:
61
96
  self.loss = "bce"
62
-
63
97
  # Number of tasks
64
98
  self.num_tasks = len(target)
65
99
  if len(tower_params_list) != self.num_tasks:
66
100
  raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
67
-
68
- # All features
69
- self.all_features = dense_features + sparse_features + sequence_features
70
-
71
101
  # Embedding layer
72
102
  self.embedding = EmbeddingLayer(features=self.all_features)
73
-
74
103
  # Calculate input dimension
75
- emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
76
- dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
77
- input_dim = emb_dim_total + dense_input_dim
104
+ input_dim = self.embedding.input_dim
105
+ # emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
106
+ # dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
107
+ # input_dim = emb_dim_total + dense_input_dim
78
108
 
79
109
  # Shared bottom network
80
110
  self.bottom = MLP(input_dim=input_dim, output_layer=False, **bottom_params)
@@ -90,23 +120,10 @@ class ShareBottom(BaseModel):
90
120
  for tower_params in tower_params_list:
91
121
  tower = MLP(input_dim=bottom_output_dim, output_layer=True, **tower_params)
92
122
  self.towers.append(tower)
93
- self.prediction_layer = PredictionLayer(
94
- task_type=self.task_type,
95
- task_dims=[1] * self.num_tasks
96
- )
97
-
123
+ self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
98
124
  # Register regularization weights
99
- self._register_regularization_weights(
100
- embedding_attr='embedding',
101
- include_modules=['bottom', 'towers']
102
- )
103
-
104
- self.compile(
105
- optimizer=optimizer,
106
- optimizer_params=optimizer_params,
107
- loss=loss,
108
- loss_params=loss_params,
109
- )
125
+ self._register_regularization_weights(embedding_attr='embedding', include_modules=['bottom', 'towers'])
126
+ self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)
110
127
 
111
128
  def forward(self, x):
112
129
  # Get all embeddings and flatten
@@ -7,6 +7,7 @@ from .autoint import AutoInt
7
7
  from .widedeep import WideDeep
8
8
  from .xdeepfm import xDeepFM
9
9
  from .dcn import DCN
10
+ from .fibinet import FiBiNET
10
11
  from .din import DIN
11
12
  from .dien import DIEN
12
13
 
@@ -22,4 +23,5 @@ __all__ = [
22
23
  'AFM',
23
24
  'MaskNet',
24
25
  'PNN',
26
+ 'FiBiNET',
25
27
  ]
@@ -115,7 +115,7 @@ class AutoInt(BaseModel):
115
115
  self.att_embedding_dim = att_embedding_dim
116
116
 
117
117
  # Use sparse and sequence features for interaction
118
- self.interaction_features = sparse_features + sequence_features
118
+ self.interaction_features = dense_features + sparse_features + sequence_features
119
119
 
120
120
  # All features for embedding
121
121
  self.all_features = dense_features + sparse_features + sequence_features
@@ -12,9 +12,28 @@ import torch
12
12
  import torch.nn as nn
13
13
 
14
14
  from nextrec.basic.model import BaseModel
15
- from nextrec.basic.layers import EmbeddingLayer, MLP, CrossNetwork, PredictionLayer
15
+ from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
16
16
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
17
17
 
18
class CrossNetwork(nn.Module):
    """Stacked Cross Layers from DCN (Wang et al., 2017).

    Each layer computes ``x_{l+1} = x_0 * (w_l^T x_l) + b_l + x_l``: the
    scalar projection ``w_l^T x_l`` keeps the per-layer parameter count
    linear in ``input_dim``, and the ``+ x_l`` term is a residual connection.
    """

    def __init__(self, input_dim, num_layers):
        """
        Args:
            input_dim: width of the flattened input feature vector.
            num_layers: number of stacked cross layers.
        """
        super().__init__()
        self.num_layers = num_layers
        # Each w_l projects (B, input_dim) -> (B, 1): a scalar per sample.
        self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)])
        self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        :return: Float tensor of size ``(batch_size, input_dim)``

        Note: the weights operate on the flattened feature vector, not on a
        3-D ``(batch, num_fields, embed_dim)`` tensor.
        """
        x0 = x  # keep the original input for the explicit crossing term
        for i in range(self.num_layers):
            xw = self.w[i](x)  # (B, 1) scalar projection, broadcasts over x0
            x = x0 * xw + self.b[i] + x  # cross term + bias + residual
        return x
36
+
18
37
 
19
38
  class DCN(BaseModel):
20
39
  @property
@@ -0,0 +1,84 @@
1
+ """
2
+ Date: create on 09/11/2025
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from nextrec.basic.model import BaseModel
9
+ from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
10
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
11
+
12
class CrossNetV2(nn.Module):
    """Vector-wise cross network proposed in DCN V2 (Wang et al., 2021).

    Each layer computes ``x_{l+1} = x_0 ⊙ (W_l x_l) + b_l + x_l`` with a
    full-rank weight matrix W_l, plus a residual connection.
    """

    def __init__(self, input_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Full input_dim x input_dim projection per layer (vector-wise cross).
        self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim, bias=False) for _ in range(num_layers)])
        self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        :return: Float tensor of size ``(batch_size, input_dim)``
        """
        x_cur = x  # evolves layer by layer; `x` stays the original input x_0
        for layer_w, layer_b in zip(self.w, self.b):
            x_cur = x * layer_w(x_cur) + layer_b + x_cur
        return x_cur
26
+
27
class CrossNetMix(nn.Module):
    """Mixture of low-rank cross experts from DCN V2 (Wang et al., 2021).

    Each layer projects x_l into a low-rank subspace per expert
    (x -> V^T x -> tanh -> C -> tanh -> U), mixes the experts with a
    softmax over per-expert gating scores, and adds a residual connection.
    """

    def __init__(self, input_dim, num_layers=2, low_rank=32, num_experts=4):
        """
        Args:
            input_dim: width of the flattened input feature vector.
            num_layers: number of stacked cross layers.
            low_rank: rank r of the U/V projections.
            num_experts: number of low-rank experts per layer.
        """
        super().__init__()
        self.num_layers = num_layers
        self.num_experts = num_experts

        # U: (num_experts, input_dim, low_rank) — projects back to R^d
        self.u_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
            torch.empty(num_experts, input_dim, low_rank))) for i in range(self.num_layers)])
        # V: (num_experts, input_dim, low_rank) — projects down to R^r
        self.v_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
            torch.empty(num_experts, input_dim, low_rank))) for i in range(self.num_layers)])
        # C: (num_experts, low_rank, low_rank) — mixing inside the low-rank space
        self.c_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
            torch.empty(num_experts, low_rank, low_rank))) for i in range(self.num_layers)])
        # Per-expert gating score; NOTE: shared across layers, matching the
        # reference implementation.
        self.gating = nn.ModuleList([nn.Linear(input_dim, 1, bias=False) for i in range(self.num_experts)])

        self.bias = torch.nn.ParameterList([nn.Parameter(nn.init.zeros_(
            torch.empty(input_dim, 1))) for i in range(self.num_layers)])

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        :return: Float tensor of size ``(batch_size, input_dim)``
        """
        x_0 = x.unsqueeze(2)  # (bs, in_features, 1)
        x_l = x_0
        for i in range(self.num_layers):
            output_of_experts = []
            gating_score_experts = []
            for expert_id in range(self.num_experts):
                # (1) G(x_l): gating score computed from the current x_l
                gating_score_experts.append(self.gating[expert_id](x_l.squeeze(2)))

                # (2) E(x_l): project x_l into the low-rank space R^r
                v_x = torch.matmul(self.v_list[i][expert_id].t(), x_l)  # (bs, low_rank, 1)

                # nonlinear activation in the low-rank space
                v_x = torch.tanh(v_x)
                v_x = torch.matmul(self.c_list[i][expert_id], v_x)
                v_x = torch.tanh(v_x)

                # project back to R^d
                uv_x = torch.matmul(self.u_list[i][expert_id], v_x)  # (bs, in_features, 1)

                dot_ = uv_x + self.bias[i]
                dot_ = x_0 * dot_  # Hadamard product with the original input

                output_of_experts.append(dot_.squeeze(2))

            # (3) mixture of low-rank experts
            output_of_experts = torch.stack(output_of_experts, 2)  # (bs, in_features, num_experts)
            gating_score_experts = torch.stack(gating_score_experts, 1)  # (bs, num_experts, 1)
            moe_out = torch.matmul(output_of_experts, gating_score_experts.softmax(1))
            x_l = moe_out + x_l  # (bs, in_features, 1), residual connection

        # BUG FIX: squeeze only the trailing singleton dimension. A bare
        # squeeze() would also drop the batch dimension when batch_size == 1
        # (or the feature dimension when input_dim == 1), corrupting the
        # output shape downstream.
        x_l = x_l.squeeze(2)  # (bs, in_features)
        return x_l