nextrec-0.2.7-py3-none-any.whl → nextrec-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +4 -8
- nextrec/basic/callback.py +1 -1
- nextrec/basic/features.py +33 -25
- nextrec/basic/layers.py +164 -601
- nextrec/basic/loggers.py +3 -4
- nextrec/basic/metrics.py +39 -115
- nextrec/basic/model.py +248 -174
- nextrec/basic/session.py +1 -5
- nextrec/data/__init__.py +12 -0
- nextrec/data/data_utils.py +3 -27
- nextrec/data/dataloader.py +26 -34
- nextrec/data/preprocessor.py +2 -1
- nextrec/loss/listwise.py +6 -4
- nextrec/loss/loss_utils.py +10 -6
- nextrec/loss/pairwise.py +5 -3
- nextrec/loss/pointwise.py +7 -13
- nextrec/models/match/mind.py +110 -1
- nextrec/models/multi_task/esmm.py +46 -27
- nextrec/models/multi_task/mmoe.py +48 -30
- nextrec/models/multi_task/ple.py +156 -141
- nextrec/models/multi_task/poso.py +413 -0
- nextrec/models/multi_task/share_bottom.py +43 -26
- nextrec/models/ranking/__init__.py +2 -0
- nextrec/models/ranking/dcn.py +20 -1
- nextrec/models/ranking/dcn_v2.py +84 -0
- nextrec/models/ranking/deepfm.py +44 -18
- nextrec/models/ranking/dien.py +130 -27
- nextrec/models/ranking/masknet.py +13 -67
- nextrec/models/ranking/widedeep.py +39 -18
- nextrec/models/ranking/xdeepfm.py +34 -1
- nextrec/utils/common.py +26 -1
- nextrec-0.3.1.dist-info/METADATA +306 -0
- nextrec-0.3.1.dist-info/RECORD +56 -0
- nextrec-0.2.7.dist-info/METADATA +0 -281
- nextrec-0.2.7.dist-info/RECORD +0 -54
- {nextrec-0.2.7.dist-info → nextrec-0.3.1.dist-info}/WHEEL +0 -0
- {nextrec-0.2.7.dist-info → nextrec-0.3.1.dist-info}/licenses/LICENSE +0 -0
nextrec/models/multi_task/poso.py
ADDED
@@ -0,0 +1,413 @@
+"""
+Date: create on 28/11/2025
+Author: Yang Zhou,zyaztec@gmail.com
+Reference:
+[1] Wang et al. "POSO: Personalized Cold Start Modules for Large-scale Recommender Systems", 2021.
+
+POSO (Personalized cOld-start mOdules) augments backbone recommenders by injecting a
+personalized cold-start vector `pc` that gates hidden units layer by layer. Each fully
+connected layer or expert output is multiplied by gate(pc), letting the backbone adapt
+its hidden representations to user profiles even when behavioral signals are scarce.
+
+Core idea:
+(1) A lightweight two-layer MLP maps `pc` to gate(pc) = C * sigmoid(W2 * phi(W1 * pc + b1) + b2)
+(2) gate(pc) scales each hidden unit element-wise, masking or amplifying features
+(3) Existing task gates/towers remain intact; POSO only overlays personalization
+
+Key advantages:
+- Plug-and-play personalization for cold-start users without redesigning the backbone
+- Per-layer/expert gating with minimal additional parameters
+- Compatible with plain MLP towers and MMoE structures, keeping training stable
+- Works with split features: main features feed the backbone, PC features drive gates
+
+POSO 通过个性化冷启动向量 `pc` 为推荐模型叠加逐层的门控系数,
+在每个全连接层或专家输出上乘以 gate(pc) 做元素级缩放,
+即使行为信号稀缺也能按用户画像调整隐藏表示。
+
+实现思路:
+(1) 用轻量两层 MLP 生成 gate(pc) = C * sigmoid(W2 * phi(W1 * pc + b1) + b2)
+(2) gate(pc) 对神经元逐元素放大或抑制
+(3) 原有任务门/塔不变,POSO 仅叠加个性化门控
+
+主要优点:
+- 冷启动场景的可插拔个性化,无需重做骨干结构
+- 每层/每专家独立门控,新增参数量小
+- 兼容 MLP、MMoE 等多任务骨干,训练过程平稳
+- 主特征做建模,PC 特征驱动门控,解耦表征与个性化信号
+"""
+
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
+from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+from nextrec.basic.activation import activation_layer
+from nextrec.basic.model import BaseModel
+from nextrec.utils.common import merge_features
+
+
+class POSOGate(nn.Module):
+    """
+    Two-layer MLP that maps the personalized cold-start vector to a gate:
+        gate(pc) = C * sigmoid( W2 * phi(W1 * pc + b1) + b2 )
+    The output shares the same dimension as the hidden vector to be masked and
+    is applied element-wise.
+    """
+
+    def __init__(
+        self,
+        pc_dim: int,
+        out_dim: int,
+        hidden_dim: int = 32,
+        scale_factor: float = 2.0,
+        activation: str = "relu",
+    ) -> None:
+        super().__init__()
+        self.fc1 = nn.Linear(pc_dim, hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim, out_dim)
+        self.scale_factor = scale_factor
+        self.act = activation_layer(activation)
+
+    def forward(self, pc: torch.Tensor) -> torch.Tensor:
+        """
+        pc: (B, pc_dim)
+        return: (B, out_dim) in (0, C)
+        """
+        h = self.act(self.fc1(pc))
+        g = torch.sigmoid(self.fc2(h))  # (B, out_dim) in (0,1)
+        return self.scale_factor * g
+
+
+class POSOFC(nn.Module):
+    """
+    Single POSO fully connected layer mirroring Eq. (11):
+        h = phi(Wx + b)
+        h_hat = gate(pc) ⊙ h
+    where gate(pc) = C * sigmoid(MLP(pc)).
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        pc_dim: int,
+        gate_hidden_dim: int = 32,
+        scale_factor: float = 2.0,
+        activation: str = "relu",
+        use_bias: bool = True,
+    ) -> None:
+        super().__init__()
+        self.linear = nn.Linear(in_dim, out_dim, bias=use_bias)
+        self.act = activation_layer(activation)
+        self.gate = POSOGate(
+            pc_dim=pc_dim,
+            out_dim=out_dim,
+            hidden_dim=gate_hidden_dim,
+            scale_factor=scale_factor,
+            activation=activation,
+        )
+
+    def forward(self, x: torch.Tensor, pc: torch.Tensor) -> torch.Tensor:
+        """
+        x: (B, in_dim)
+        pc: (B, pc_dim)
+        return: (B, out_dim)
+        """
+        h = self.act(self.linear(x))  # Standard FC with activation
+        g = self.gate(pc)  # (B, out_dim)
+        return g * h  # Element-wise gating
+
+
+class POSOMLP(nn.Module):
+    """
+    POSO-enhanced MLP that stacks multiple POSOFC layers.
+
+    dims: e.g., [256, 128, 64] means
+        in_dim -> 256 -> 128 -> 64
+    Each layer has its own gate g_l(pc) following Eq. (11).
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        pc_dim: int,
+        dims: list[int],
+        gate_hidden_dim: int = 32,
+        scale_factor: float = 2.0,
+        activation: str = "relu",
+        use_bias: bool = True,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+
+        layers = []
+        in_dim = input_dim
+        for out_dim in dims:
+            layers.append(
+                POSOFC(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    pc_dim=pc_dim,
+                    gate_hidden_dim=gate_hidden_dim,
+                    scale_factor=scale_factor,
+                    activation=activation,
+                    use_bias=use_bias,
+                )
+            )
+            in_dim = out_dim
+
+        self.layers = nn.ModuleList(layers)
+        self.dropout = nn.Dropout(dropout) if dropout > 0 else None
+
+    def forward(self, x: torch.Tensor, pc: torch.Tensor) -> torch.Tensor:
+        """
+        x: (B, input_dim)
+        pc: (B, pc_dim)
+        """
+        h = x
+        for layer in self.layers:
+            h = layer(h, pc)
+            if self.dropout is not None:
+                h = self.dropout(h)
+        return h
+
+
+class POSOMMoE(nn.Module):
+    """
+    POSO(MMoE) mirrors Section 4.4 and Eq. (15)-(18) of the paper:
+    - Keep the original experts and task gates gate_t(x)
+    - Add a PC gate g_e(pc) for every expert_e
+    - Task gates aggregate the PC-masked expert outputs
+
+    Concretely:
+        h_e = expert_e(x)                 # (B, D)
+        g_e = POSOGate(pc) in (0, C)^{D}  # (B, D)
+        h_e_tilde = g_e ⊙ h_e             # (B, D)
+        z_t = Σ_e gate_t,e(x) * h_e_tilde
+    """
+
+    def __init__(
+        self,
+        input_dim: int,
+        pc_dim: int,  # for poso feature dimension
+        num_experts: int,
+        expert_hidden_dims: list[int],
+        num_tasks: int,
+        activation: str = "relu",
+        expert_dropout: float = 0.0,
+        gate_hidden_dim: int = 32,  # for poso gate hidden dimension
+        scale_factor: float = 2.0,  # for poso gate scale factor
+        gate_use_softmax: bool = True,
+    ) -> None:
+        super().__init__()
+        self.num_experts = num_experts
+        self.num_tasks = num_tasks
+
+        # Experts built with framework MLP, same as standard MMoE
+        self.experts = nn.ModuleList([MLP(input_dim=input_dim, output_layer=False, dims=expert_hidden_dims, activation=activation, dropout=expert_dropout,) for _ in range(num_experts)])
+        self.expert_output_dim = expert_hidden_dims[-1] if expert_hidden_dims else input_dim
+
+        # Task-specific gates: gate_t(x) over experts
+        self.gates = nn.ModuleList([nn.Linear(input_dim, num_experts) for _ in range(num_tasks)])
+        self.gate_use_softmax = gate_use_softmax
+
+        # PC gate per expert: g_e(pc) ∈ R^D
+        self.expert_pc_gates = nn.ModuleList([POSOGate(pc_dim=pc_dim, out_dim=self.expert_output_dim, hidden_dim=gate_hidden_dim, scale_factor=scale_factor, activation=activation,) for _ in range(num_experts)])
+
+    def forward(self, x: torch.Tensor, pc: torch.Tensor) -> list[torch.Tensor]:
+        """
+        x: (B, input_dim)
+        pc: (B, pc_dim)
+        return: list of task outputs z_t with length num_tasks, each (B, D)
+        """
+        # 1) Expert outputs with POSO PC gate
+        masked_expert_outputs = []
+        for e, expert in enumerate(self.experts):
+            h_e = expert(x)  # (B, D)
+            g_e = self.expert_pc_gates[e](pc)  # (B, D)
+            h_e_tilde = g_e * h_e  # (B, D)
+            masked_expert_outputs.append(h_e_tilde)
+
+        masked_expert_outputs = torch.stack(masked_expert_outputs, dim=1)  # (B, E, D)
+
+        # 2) Task gates depend on x as in standard MMoE
+        task_outputs: list[torch.Tensor] = []
+        for t in range(self.num_tasks):
+            logits = self.gates[t](x)  # (B, E)
+            if self.gate_use_softmax:
+                gate = F.softmax(logits, dim=1)
+            else:
+                gate = logits
+
+            gate = gate.unsqueeze(-1)  # (B, E, 1)
+            z_t = torch.sum(gate * masked_expert_outputs, dim=1)  # (B, D)
+            task_outputs.append(z_t)
+
+        return task_outputs
+
+
+class POSO(BaseModel):
+    """
+    POSO model implemented with the NextRec framework. It supports two backbones:
+    - "mlp": per-task POSO-MLP towers with PC gating on every hidden layer
+    - "mmoe": POSO-gated MMoE experts plus task-specific towers
+    """
+
+    @property
+    def model_name(self) -> str:
+        return "POSO"
+
+    @property
+    def task_type(self) -> list[str]:
+        return self.task if isinstance(self.task, list) else [self.task]
+
+    def __init__(
+        self,
+        main_dense_features: list[DenseFeature] | None,
+        main_sparse_features: list[SparseFeature] | None,
+        main_sequence_features: list[SequenceFeature] | None,
+        pc_dense_features: list[DenseFeature] | None,
+        pc_sparse_features: list[SparseFeature] | None,
+        pc_sequence_features: list[SequenceFeature] | None,
+        tower_params_list: list[dict],
+        target: list[str],
+        task: str | list[str] = "binary",
+        architecture: str = "mlp",
+        # POSO gating defaults
+        gate_hidden_dim: int = 32,
+        gate_scale_factor: float = 2.0,
+        gate_activation: str = "relu",
+        gate_use_bias: bool = True,
+        # MMoE-specific params
+        num_experts: int = 4,
+        expert_hidden_dims: list[int] | None = None,
+        expert_activation: str = "relu",
+        expert_dropout: float = 0.0,
+        expert_gate_hidden_dim: int = 32,
+        expert_gate_scale_factor: float = 2.0,
+        gate_use_softmax: bool = True,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg: float = 1e-6,
+        dense_l1_reg: float = 1e-5,
+        embedding_l2_reg: float = 1e-5,
+        dense_l2_reg: float = 1e-4,
+        **kwargs,
+    ):
+        # Keep explicit copies of main and PC features
+        self.main_dense_features = list(main_dense_features or [])
+        self.main_sparse_features = list(main_sparse_features or [])
+        self.main_sequence_features = list(main_sequence_features or [])
+        self.pc_dense_features = list(pc_dense_features or [])
+        self.pc_sparse_features = list(pc_sparse_features or [])
+        self.pc_sequence_features = list(pc_sequence_features or [])
+
+        if not self.pc_dense_features and not self.pc_sparse_features and not self.pc_sequence_features:
+            raise ValueError("POSO requires at least one PC feature for personalization.")
+
+        dense_features = merge_features(self.main_dense_features, self.pc_dense_features)
+        sparse_features = merge_features(self.main_sparse_features, self.pc_sparse_features)
+        sequence_features = merge_features(self.main_sequence_features, self.pc_sequence_features)
+
+        super().__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=task,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            early_stop_patience=20,
+            **kwargs,
+        )
+
+        self.loss = loss if loss is not None else "bce"
+        optimizer_params = optimizer_params or {}
+
+        self.num_tasks = len(target)
+        if len(tower_params_list) != self.num_tasks:
+            raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
+        self.main_features = self.main_dense_features + self.main_sparse_features + self.main_sequence_features
+        self.pc_features = self.pc_dense_features + self.pc_sparse_features + self.pc_sequence_features
+
+        self.embedding = EmbeddingLayer(features=self.all_features)
+        self.main_input_dim = self.embedding.get_input_dim(self.main_features)
+        self.pc_input_dim = self.embedding.get_input_dim(self.pc_features)
+
+        self.architecture = architecture.lower()
+        if self.architecture not in {"mlp", "mmoe"}:
+            raise ValueError(f"Unsupported architecture '{architecture}', choose from ['mlp', 'mmoe'].")
+
+        # Build backbones
+        if self.architecture == "mlp":
+            self.towers = nn.ModuleList()
+            self.tower_heads = nn.ModuleList()
+            for tower_params in tower_params_list:
+                dims = tower_params.get("dims")
+                if not dims:
+                    raise ValueError("tower_params must include a non-empty 'dims' list for POSO-MLP towers.")
+                dropout = tower_params.get("dropout", 0.0)
+                tower = POSOMLP(
+                    input_dim=self.main_input_dim,
+                    pc_dim=self.pc_input_dim,
+                    dims=dims,
+                    gate_hidden_dim=tower_params.get("gate_hidden_dim", gate_hidden_dim),
+                    scale_factor=tower_params.get("scale_factor", gate_scale_factor),
+                    activation=tower_params.get("activation", gate_activation),
+                    use_bias=tower_params.get("use_bias", gate_use_bias),
+                    dropout=dropout,
+                )
+                self.towers.append(tower)
+                tower_output_dim = dims[-1] if dims else self.main_input_dim
+                self.tower_heads.append(nn.Linear(tower_output_dim, 1))
+        else:
+            if expert_hidden_dims is None or not expert_hidden_dims:
+                raise ValueError("expert_hidden_dims must be provided for MMoE architecture.")
+            self.mmoe = POSOMMoE(
+                input_dim=self.main_input_dim,
+                pc_dim=self.pc_input_dim,
+                num_experts=num_experts,
+                expert_hidden_dims=expert_hidden_dims,
+                num_tasks=self.num_tasks,
+                activation=expert_activation,
+                expert_dropout=expert_dropout,
+                gate_hidden_dim=expert_gate_hidden_dim,
+                scale_factor=expert_gate_scale_factor,
+                gate_use_softmax=gate_use_softmax,
+            )
+            self.towers = nn.ModuleList([MLP(input_dim=self.mmoe.expert_output_dim, output_layer=True, **tower_params,) for tower_params in tower_params_list])
+            self.tower_heads = None
+        self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks,)
+        include_modules = ["towers", "tower_heads"] if self.architecture == "mlp" else ["mmoe", "towers"]
+        self._register_regularization_weights(embedding_attr="embedding", include_modules=include_modules)
+        self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

    def forward(self, x):
        # Embed main and PC features separately so PC can gate hidden units
        main_input = self.embedding(x=x, features=self.main_features, squeeze_dim=True)
        pc_input = self.embedding(x=x, features=self.pc_features, squeeze_dim=True)

        task_outputs = []
        if self.architecture == "mlp":
            for tower, head in zip(self.towers, self.tower_heads):
                hidden = tower(main_input, pc_input)
                logit = head(hidden)
                task_outputs.append(logit)
        else:
            expert_outputs = self.mmoe(main_input, pc_input)
            for idx, tower in enumerate(self.towers):
                logit = tower(expert_outputs[idx])
                task_outputs.append(logit)

        y = torch.cat(task_outputs, dim=1)
        return self.prediction_layer(y)
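The gating math above is easy to sanity-check outside the framework. The snippet below is a minimal standalone sketch of what POSOGate/POSOFC compute, in plain PyTorch with toy dimensions (it does not use the package classes): a two-layer MLP turns the cold-start vector into gate(pc) = C * sigmoid(W2 * relu(W1 * pc + b1) + b2), which then rescales one hidden layer element-wise.

import torch
import torch.nn as nn

B, pc_dim, in_dim, out_dim, C = 4, 8, 16, 32, 2.0  # toy sizes for illustration

gate_mlp = nn.Sequential(nn.Linear(pc_dim, 32), nn.ReLU(), nn.Linear(32, out_dim))  # W1/b1, phi, W2/b2
fc = nn.Linear(in_dim, out_dim)  # the backbone layer being personalized

x = torch.randn(B, in_dim)   # main-feature input
pc = torch.randn(B, pc_dim)  # personalized cold-start vector

h = torch.relu(fc(x))                # h = phi(Wx + b)
g = C * torch.sigmoid(gate_mlp(pc))  # gate(pc), values in (0, C)
h_hat = g * h                        # element-wise gating, Eq. (11)
print(h_hat.shape)                   # torch.Size([4, 32])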
nextrec/models/multi_task/share_bottom.py
CHANGED
@@ -1,7 +1,42 @@
 """
 Date: create on 09/11/2025
+Checkpoint: edit on 24/11/2025
 Author: Yang Zhou,zyaztec@gmail.com
-Reference:
+Reference:
+[1] Caruana R. Multitask learning[J]. Machine Learning, 1997, 28: 41-75.
+(https://link.springer.com/article/10.1023/A:1007379606734)
+
+Shared-Bottom is the classic hard-parameter-sharing baseline for multi-task learning.
+All tasks share a common bottom network to learn general representations, and each
+task has its own tower head for task-specific refinement and prediction. This
+architecture is simple, parameter-efficient, and helps regularize related tasks.
+
+Workflow:
+(1) Unified embeddings convert dense/sparse/sequence features
+(2) A shared bottom MLP learns common representations
+(3) Task-specific towers further transform the shared features
+(4) Separate prediction heads output each task’s logits/probabilities
+
+Key Advantages:
+- Strong inductive bias via hard parameter sharing, reducing overfitting
+- Parameter-efficient compared to duplicating full models per task
+- Easy to extend to many tasks with small incremental cost
+- Serves as a stable baseline for evaluating advanced MTL architectures
+
+Share-Bottom(硬共享底层)是多任务学习的经典基线:所有任务共享一个底层网络,
+各任务拥有独立塔头进行细化与预测,简单高效且能通过共享正则化相关任务。
+
+流程:
+(1) 统一 embedding 处理稠密、稀疏与序列特征
+(2) 共享底层 MLP 学习通用表示
+(3) 任务塔在共享表示上做任务特定变换
+(4) 各任务预测头输出对应结果
+
+主要优点:
+- 硬参数共享提供强正则,减少过拟合
+- 相比单独模型更节省参数与计算
+- 易于扩展到多任务,增量开销小
+- 是评估更复杂 MTL 结构的稳健基线
 """
 
 import torch
@@ -59,22 +94,17 @@ class ShareBottom(BaseModel):
         self.loss = loss
         if self.loss is None:
             self.loss = "bce"
-
         # Number of tasks
         self.num_tasks = len(target)
         if len(tower_params_list) != self.num_tasks:
             raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
-
-        # All features
-        self.all_features = dense_features + sparse_features + sequence_features
-
         # Embedding layer
         self.embedding = EmbeddingLayer(features=self.all_features)
-
         # Calculate input dimension
-        emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
-        dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
-        input_dim = emb_dim_total + dense_input_dim
+        input_dim = self.embedding.input_dim
+        # emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+        # dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
+        # input_dim = emb_dim_total + dense_input_dim
 
         # Shared bottom network
         self.bottom = MLP(input_dim=input_dim, output_layer=False, **bottom_params)
@@ -90,23 +120,10 @@ class ShareBottom(BaseModel):
         for tower_params in tower_params_list:
             tower = MLP(input_dim=bottom_output_dim, output_layer=True, **tower_params)
             self.towers.append(tower)
-        self.prediction_layer = PredictionLayer(
-            task_type=self.task_type,
-            task_dims=[1] * self.num_tasks
-        )
-
+        self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
         # Register regularization weights
-        self._register_regularization_weights(
-
-            include_modules=['bottom', 'towers']
-        )
-
-        self.compile(
-            optimizer=optimizer,
-            optimizer_params=optimizer_params,
-            loss=loss,
-            loss_params=loss_params,
-        )
+        self._register_regularization_weights(embedding_attr='embedding', include_modules=['bottom', 'towers'])
+        self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)
 
     def forward(self, x):
         # Get all embeddings and flatten
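For orientation, the workflow described in the new docstring (shared bottom MLP, then per-task towers and prediction heads) reduces to a few lines of plain PyTorch. This is a generic hard-parameter-sharing sketch with toy dimensions, not the ShareBottom class itself:

import torch
import torch.nn as nn

input_dim, bottom_dim, num_tasks = 64, 32, 2  # toy sizes for illustration

bottom = nn.Sequential(nn.Linear(input_dim, bottom_dim), nn.ReLU())  # shared bottom
towers = nn.ModuleList(
    [nn.Sequential(nn.Linear(bottom_dim, 16), nn.ReLU(), nn.Linear(16, 1)) for _ in range(num_tasks)]
)  # one tower + prediction head per task

x = torch.randn(8, input_dim)                       # flattened embeddings for a batch of 8
shared = bottom(x)                                  # common representation
probs = [torch.sigmoid(t(shared)) for t in towers]  # one binary prediction per task
print([p.shape for p in probs])                     # [torch.Size([8, 1]), torch.Size([8, 1])]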
nextrec/models/ranking/__init__.py
CHANGED
@@ -7,6 +7,7 @@ from .autoint import AutoInt
 from .widedeep import WideDeep
 from .xdeepfm import xDeepFM
 from .dcn import DCN
+from .fibinet import FiBiNET
 from .din import DIN
 from .dien import DIEN
 
@@ -22,4 +23,5 @@ __all__ = [
     'AFM',
     'MaskNet',
     'PNN',
+    'FiBiNET',
 ]
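The only functional change here is the package-level re-export; after upgrading, FiBiNET can be imported directly from the ranking subpackage:

from nextrec.models.ranking import FiBiNET  # re-exported via __all__ as of 0.3.1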
nextrec/models/ranking/dcn.py
CHANGED
@@ -12,9 +12,28 @@ import torch
 import torch.nn as nn
 
 from nextrec.basic.model import BaseModel
-from nextrec.basic.layers import EmbeddingLayer, MLP,
+from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
+class CrossNetwork(nn.Module):
+    """Stacked Cross Layers from DCN (Wang et al., 2017)."""
+
+    def __init__(self, input_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)])
+        self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])
+
+    def forward(self, x):
+        """
+        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
+        """
+        x0 = x
+        for i in range(self.num_layers):
+            xw = self.w[i](x)
+            x = x0 * xw + self.b[i] + x
+        return x
+
 
 class DCN(BaseModel):
     @property
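CrossNetwork implements the DCN-v1 recurrence x_{l+1} = x_0 * (w_l^T x_l) + b_l + x_l, where w_l^T x_l is a scalar per sample that rescales x_0. A quick shape check, assuming nextrec 0.3.1 is installed (the class now lives in this module):

import torch
from nextrec.models.ranking.dcn import CrossNetwork  # class added in this diff

cross = CrossNetwork(input_dim=16, num_layers=3)
x = torch.randn(4, 16)  # flattened embedding vector per sample
print(cross(x).shape)   # torch.Size([4, 16]); cross layers preserve the input dimension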
nextrec/models/ranking/dcn_v2.py
ADDED
@@ -0,0 +1,84 @@
+"""
+Date: create on 09/11/2025
+"""
+
+import torch
+import torch.nn as nn
+
+from nextrec.basic.model import BaseModel
+from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+class CrossNetV2(nn.Module):
+    """Vector-wise cross network proposed in DCN V2 (Wang et al., 2021)."""
+    def __init__(self, input_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim, bias=False) for _ in range(num_layers)])
+        self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])
+
+
+    def forward(self, x):
+        x0 = x
+        for i in range(self.num_layers):
+            x = x0 * self.w[i](x) + self.b[i] + x
+        return x
+
+class CrossNetMix(nn.Module):
+    """Mixture of low-rank cross experts from DCN V2 (Wang et al., 2021)."""
+
+    def __init__(self, input_dim, num_layers=2, low_rank=32, num_experts=4):
+        super(CrossNetMix, self).__init__()
+        self.num_layers = num_layers
+        self.num_experts = num_experts
+
+        # U: (input_dim, low_rank)
+        self.u_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
+            torch.empty(num_experts, input_dim, low_rank))) for i in range(self.num_layers)])
+        # V: (input_dim, low_rank)
+        self.v_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
+            torch.empty(num_experts, input_dim, low_rank))) for i in range(self.num_layers)])
+        # C: (low_rank, low_rank)
+        self.c_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
+            torch.empty(num_experts, low_rank, low_rank))) for i in range(self.num_layers)])
+        self.gating = nn.ModuleList([nn.Linear(input_dim, 1, bias=False) for i in range(self.num_experts)])
+
+        self.bias = torch.nn.ParameterList([nn.Parameter(nn.init.zeros_(
+            torch.empty(input_dim, 1))) for i in range(self.num_layers)])
+
+    def forward(self, x):
+        x_0 = x.unsqueeze(2)  # (bs, in_features, 1)
+        x_l = x_0
+        for i in range(self.num_layers):
+            output_of_experts = []
+            gating_score_experts = []
+            for expert_id in range(self.num_experts):
+                # (1) G(x_l)
+                # compute the gating score by x_l
+                gating_score_experts.append(self.gating[expert_id](x_l.squeeze(2)))
+
+                # (2) E(x_l)
+                # project the input x_l to $\mathbb{R}^{r}$
+                v_x = torch.matmul(self.v_list[i][expert_id].t(), x_l)  # (bs, low_rank, 1)
+
+                # nonlinear activation in low rank space
+                v_x = torch.tanh(v_x)
+                v_x = torch.matmul(self.c_list[i][expert_id], v_x)
+                v_x = torch.tanh(v_x)
+
+                # project back to $\mathbb{R}^{d}$
+                uv_x = torch.matmul(self.u_list[i][expert_id], v_x)  # (bs, in_features, 1)
+
+                dot_ = uv_x + self.bias[i]
+                dot_ = x_0 * dot_  # Hadamard-product
+
+                output_of_experts.append(dot_.squeeze(2))
+
+            # (3) mixture of low-rank experts
+            output_of_experts = torch.stack(output_of_experts, 2)  # (bs, in_features, num_experts)
+            gating_score_experts = torch.stack(gating_score_experts, 1)  # (bs, num_experts, 1)
+            moe_out = torch.matmul(output_of_experts, gating_score_experts.softmax(1))
+            x_l = moe_out + x_l  # (bs, in_features, 1)
+
+        x_l = x_l.squeeze()  # (bs, in_features)
+        return x_l
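CrossNetV2 replaces DCN's rank-1 weight with a full input_dim x input_dim matrix per layer, and CrossNetMix approximates that matrix with a softmax-gated mixture of low-rank experts (project down with V, mix with C, project back with U). A shape sketch, again assuming nextrec 0.3.1 is installed:

import torch
from nextrec.models.ranking.dcn_v2 import CrossNetV2, CrossNetMix  # both added in this diff

x = torch.randn(4, 16)

v2 = CrossNetV2(input_dim=16, num_layers=2)
print(v2(x).shape)   # torch.Size([4, 16]); per layer: x = x0 * W_i(x) + b_i + x

mix = CrossNetMix(input_dim=16, num_layers=2, low_rank=8, num_experts=4)
print(mix(x).shape)  # torch.Size([4, 16]); low-rank experts mixed by softmax gating scores

Note that CrossNetMix ends with x_l.squeeze(), which would also drop the batch dimension for a batch of size 1.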