nextrec 0.4.25__py3-none-any.whl → 0.4.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/asserts.py +72 -0
- nextrec/basic/loggers.py +18 -1
- nextrec/basic/model.py +54 -51
- nextrec/data/batch_utils.py +23 -3
- nextrec/data/dataloader.py +3 -8
- nextrec/models/multi_task/[pre]aitm.py +173 -0
- nextrec/models/multi_task/[pre]snr_trans.py +232 -0
- nextrec/models/multi_task/[pre]star.py +192 -0
- nextrec/models/multi_task/apg.py +330 -0
- nextrec/models/multi_task/cross_stitch.py +229 -0
- nextrec/models/multi_task/escm.py +290 -0
- nextrec/models/multi_task/esmm.py +8 -21
- nextrec/models/multi_task/hmoe.py +203 -0
- nextrec/models/multi_task/mmoe.py +20 -28
- nextrec/models/multi_task/pepnet.py +81 -76
- nextrec/models/multi_task/ple.py +30 -44
- nextrec/models/multi_task/poso.py +13 -22
- nextrec/models/multi_task/share_bottom.py +14 -25
- nextrec/models/ranking/afm.py +2 -2
- nextrec/models/ranking/autoint.py +2 -4
- nextrec/models/ranking/dcn.py +2 -3
- nextrec/models/ranking/dcn_v2.py +2 -3
- nextrec/models/ranking/deepfm.py +2 -3
- nextrec/models/ranking/dien.py +7 -9
- nextrec/models/ranking/din.py +8 -10
- nextrec/models/ranking/eulernet.py +1 -2
- nextrec/models/ranking/ffm.py +1 -2
- nextrec/models/ranking/fibinet.py +2 -3
- nextrec/models/ranking/fm.py +1 -1
- nextrec/models/ranking/lr.py +1 -1
- nextrec/models/ranking/masknet.py +1 -2
- nextrec/models/ranking/pnn.py +1 -2
- nextrec/models/ranking/widedeep.py +2 -3
- nextrec/models/ranking/xdeepfm.py +2 -4
- nextrec/models/representation/rqvae.py +4 -4
- nextrec/models/retrieval/dssm.py +18 -26
- nextrec/models/retrieval/dssm_v2.py +15 -22
- nextrec/models/retrieval/mind.py +9 -15
- nextrec/models/retrieval/sdm.py +36 -33
- nextrec/models/retrieval/youtube_dnn.py +16 -24
- nextrec/models/sequential/hstu.py +2 -2
- nextrec/utils/__init__.py +5 -1
- nextrec/utils/model.py +9 -14
- {nextrec-0.4.25.dist-info → nextrec-0.4.28.dist-info}/METADATA +72 -62
- nextrec-0.4.28.dist-info/RECORD +90 -0
- nextrec/models/multi_task/aitm.py +0 -0
- nextrec/models/multi_task/snr_trans.py +0 -0
- nextrec-0.4.25.dist-info/RECORD +0 -86
- {nextrec-0.4.25.dist-info → nextrec-0.4.28.dist-info}/WHEEL +0 -0
- {nextrec-0.4.25.dist-info → nextrec-0.4.28.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.25.dist-info → nextrec-0.4.28.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Date: create on
|
|
3
|
-
Checkpoint: edit on
|
|
2
|
+
Date: create on 01/01/2026
|
|
3
|
+
Checkpoint: edit on 01/01/2026
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1]
|
|
7
|
-
|
|
8
|
-
https://github.com/alipay/MMLRec-A-Unified-Multi-Task-and-Multi-Scenario-Learning-Benchmark-for-Recommendation/
|
|
6
|
+
- [1] Chang J, Zhang C, Hui Y, Leng D, Niu Y, Song Y, Gai K. PEPNet: Parameter and Embedding Personalized Network for Infusing with Personalized Prior Information. In: Proceedings of the 29th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD ’23), 2023.
|
|
7
|
+
URL: https://arxiv.org/abs/2302.01115
|
|
8
|
+
- [2] MMLRec-A-Unified-Multi-Task-and-Multi-Scenario-Learning-Benchmark-for-Recommendation: https://github.com/alipay/MMLRec-A-Unified-Multi-Task-and-Multi-Scenario-Learning-Benchmark-for-Recommendation/
|
|
9
9
|
|
|
10
10
|
PEPNet (Parameter and Embedding Personalized Network) is a multi-task learning
|
|
11
11
|
model that personalizes both input features and layer transformations with
|
|
@@ -58,12 +58,12 @@ from nextrec.basic.layers import EmbeddingLayer, GateMLP
|
|
|
58
58
|
from nextrec.basic.heads import TaskHead
|
|
59
59
|
from nextrec.basic.model import BaseModel
|
|
60
60
|
from nextrec.utils.model import select_features
|
|
61
|
-
from nextrec.utils.types import
|
|
61
|
+
from nextrec.utils.types import TaskTypeName
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
class
|
|
64
|
+
class PPNet(nn.Module):
|
|
65
65
|
"""
|
|
66
|
-
|
|
66
|
+
PPNet: per-task tower with layer-wise gates conditioned on task context.
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
69
|
def __init__(
|
|
@@ -71,34 +71,42 @@ class PPNetBlock(nn.Module):
|
|
|
71
71
|
input_dim: int,
|
|
72
72
|
output_dim: int,
|
|
73
73
|
gate_input_dim: int,
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
hidden_activations: ActivationName | list[ActivationName] = "relu",
|
|
77
|
-
dropout_rates: float | list[float] = 0.0,
|
|
78
|
-
batch_norm: bool = False,
|
|
74
|
+
mlp_params: dict | None = None,
|
|
75
|
+
gate_mlp_params: dict | None = None,
|
|
79
76
|
use_bias: bool = True,
|
|
80
|
-
gate_activation: ActivationName = "relu",
|
|
81
|
-
gate_dropout: float = 0.0,
|
|
82
|
-
gate_use_bn: bool = False,
|
|
83
77
|
) -> None:
|
|
84
78
|
super().__init__()
|
|
85
|
-
|
|
79
|
+
mlp_params = mlp_params or {}
|
|
80
|
+
gate_mlp_params = gate_mlp_params or {}
|
|
86
81
|
|
|
87
|
-
|
|
88
|
-
|
|
82
|
+
mlp_params.setdefault("hidden_dims", [])
|
|
83
|
+
mlp_params.setdefault("activation", "relu")
|
|
84
|
+
mlp_params.setdefault("dropout", 0.0)
|
|
85
|
+
mlp_params.setdefault("norm_type", "none")
|
|
86
|
+
|
|
87
|
+
gate_mlp_params.setdefault("hidden_dim", None)
|
|
88
|
+
gate_mlp_params.setdefault("activation", "relu")
|
|
89
|
+
gate_mlp_params.setdefault("dropout", 0.0)
|
|
90
|
+
gate_mlp_params.setdefault("use_bn", False)
|
|
91
|
+
|
|
92
|
+
hidden_units = mlp_params["hidden_dims"]
|
|
93
|
+
norm_type = mlp_params["norm_type"]
|
|
94
|
+
|
|
95
|
+
if isinstance(mlp_params["dropout"], list):
|
|
96
|
+
if len(mlp_params["dropout"]) != len(hidden_units):
|
|
89
97
|
raise ValueError("dropout_rates length must match hidden_units length.")
|
|
90
|
-
dropout_list =
|
|
98
|
+
dropout_list = mlp_params["dropout"]
|
|
91
99
|
else:
|
|
92
|
-
dropout_list = [
|
|
100
|
+
dropout_list = [mlp_params["dropout"]] * len(hidden_units)
|
|
93
101
|
|
|
94
|
-
if isinstance(
|
|
95
|
-
if len(
|
|
102
|
+
if isinstance(mlp_params["activation"], list):
|
|
103
|
+
if len(mlp_params["activation"]) != len(hidden_units):
|
|
96
104
|
raise ValueError(
|
|
97
105
|
"hidden_activations length must match hidden_units length."
|
|
98
106
|
)
|
|
99
|
-
activation_list =
|
|
107
|
+
activation_list = mlp_params["activation"]
|
|
100
108
|
else:
|
|
101
|
-
activation_list = [
|
|
109
|
+
activation_list = [mlp_params["activation"]] * len(hidden_units)
|
|
102
110
|
|
|
103
111
|
self.gate_layers = nn.ModuleList()
|
|
104
112
|
self.mlp_layers = nn.ModuleList()
|
|
@@ -108,7 +116,7 @@ class PPNetBlock(nn.Module):
|
|
|
108
116
|
dense_layers: list[nn.Module] = [
|
|
109
117
|
nn.Linear(layer_units[idx], layer_units[idx + 1], bias=use_bias)
|
|
110
118
|
]
|
|
111
|
-
if batch_norm:
|
|
119
|
+
if norm_type == "batch_norm":
|
|
112
120
|
dense_layers.append(nn.BatchNorm1d(layer_units[idx + 1]))
|
|
113
121
|
dense_layers.append(activation_layer(activation_list[idx]))
|
|
114
122
|
if dropout_list[idx] > 0:
|
|
@@ -117,11 +125,11 @@ class PPNetBlock(nn.Module):
|
|
|
117
125
|
self.gate_layers.append(
|
|
118
126
|
GateMLP(
|
|
119
127
|
input_dim=gate_input_dim,
|
|
120
|
-
hidden_dim=
|
|
128
|
+
hidden_dim=gate_mlp_params["hidden_dim"],
|
|
121
129
|
output_dim=layer_units[idx],
|
|
122
|
-
activation=
|
|
123
|
-
dropout=
|
|
124
|
-
use_bn=
|
|
130
|
+
activation=gate_mlp_params["activation"],
|
|
131
|
+
dropout=gate_mlp_params["dropout"],
|
|
132
|
+
use_bn=gate_mlp_params["use_bn"],
|
|
125
133
|
scale_factor=2.0,
|
|
126
134
|
)
|
|
127
135
|
)
|
|
@@ -130,11 +138,11 @@ class PPNetBlock(nn.Module):
|
|
|
130
138
|
self.gate_layers.append(
|
|
131
139
|
GateMLP(
|
|
132
140
|
input_dim=gate_input_dim,
|
|
133
|
-
hidden_dim=
|
|
141
|
+
hidden_dim=gate_mlp_params["hidden_dim"],
|
|
134
142
|
output_dim=layer_units[-1],
|
|
135
|
-
activation=
|
|
136
|
-
dropout=
|
|
137
|
-
use_bn=
|
|
143
|
+
activation=gate_mlp_params["activation"],
|
|
144
|
+
dropout=gate_mlp_params["dropout"],
|
|
145
|
+
use_bn=gate_mlp_params["use_bn"],
|
|
138
146
|
scale_factor=1.0,
|
|
139
147
|
)
|
|
140
148
|
)
|
|
@@ -177,15 +185,9 @@ class PEPNet(BaseModel):
|
|
|
177
185
|
sequence_features: list[SequenceFeature] | None = None,
|
|
178
186
|
target: list[str] | str | None = None,
|
|
179
187
|
task: TaskTypeName | list[TaskTypeName] | None = None,
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
dnn_use_bn: bool = False,
|
|
184
|
-
feature_gate_hidden_dim: int = 128,
|
|
185
|
-
gate_hidden_dim: int | None = None,
|
|
186
|
-
gate_activation: ActivationName = "relu",
|
|
187
|
-
gate_dropout: float = 0.0,
|
|
188
|
-
gate_use_bn: bool = False,
|
|
188
|
+
mlp_params: dict | None = None,
|
|
189
|
+
feature_gate_mlp_params: dict | None = None,
|
|
190
|
+
gate_mlp_params: dict | None = None,
|
|
189
191
|
domain_features: list[str] | str | None = None,
|
|
190
192
|
user_features: list[str] | str | None = None,
|
|
191
193
|
item_features: list[str] | str | None = None,
|
|
@@ -195,7 +197,24 @@ class PEPNet(BaseModel):
|
|
|
195
197
|
dense_features = dense_features or []
|
|
196
198
|
sparse_features = sparse_features or []
|
|
197
199
|
sequence_features = sequence_features or []
|
|
198
|
-
|
|
200
|
+
mlp_params = mlp_params or {}
|
|
201
|
+
feature_gate_mlp_params = feature_gate_mlp_params or {}
|
|
202
|
+
gate_mlp_params = gate_mlp_params or {}
|
|
203
|
+
|
|
204
|
+
mlp_params.setdefault("hidden_dims", [256, 128])
|
|
205
|
+
mlp_params.setdefault("activation", "relu")
|
|
206
|
+
mlp_params.setdefault("dropout", 0.0)
|
|
207
|
+
mlp_params.setdefault("norm_type", "none")
|
|
208
|
+
|
|
209
|
+
feature_gate_mlp_params.setdefault("hidden_dim", 128)
|
|
210
|
+
feature_gate_mlp_params.setdefault("activation", "relu")
|
|
211
|
+
feature_gate_mlp_params.setdefault("dropout", 0.0)
|
|
212
|
+
feature_gate_mlp_params.setdefault("use_bn", False)
|
|
213
|
+
|
|
214
|
+
gate_mlp_params.setdefault("hidden_dim", None)
|
|
215
|
+
gate_mlp_params.setdefault("activation", "relu")
|
|
216
|
+
gate_mlp_params.setdefault("dropout", 0.0)
|
|
217
|
+
gate_mlp_params.setdefault("use_bn", False)
|
|
199
218
|
|
|
200
219
|
if target is None:
|
|
201
220
|
target = []
|
|
@@ -203,24 +222,13 @@ class PEPNet(BaseModel):
|
|
|
203
222
|
target = [target]
|
|
204
223
|
|
|
205
224
|
self.nums_task = len(target) if target else 1
|
|
206
|
-
resolved_task = task
|
|
207
|
-
if resolved_task is None:
|
|
208
|
-
resolved_task = self.default_task
|
|
209
|
-
elif isinstance(resolved_task, str):
|
|
210
|
-
resolved_task = [resolved_task] * self.nums_task
|
|
211
|
-
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
212
|
-
resolved_task = resolved_task * self.nums_task
|
|
213
|
-
elif len(resolved_task) != self.nums_task:
|
|
214
|
-
raise ValueError(
|
|
215
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
216
|
-
)
|
|
217
225
|
|
|
218
226
|
super().__init__(
|
|
219
227
|
dense_features=dense_features,
|
|
220
228
|
sparse_features=sparse_features,
|
|
221
229
|
sequence_features=sequence_features,
|
|
222
230
|
target=target,
|
|
223
|
-
task=
|
|
231
|
+
task=task,
|
|
224
232
|
**kwargs,
|
|
225
233
|
)
|
|
226
234
|
|
|
@@ -266,30 +274,27 @@ class PEPNet(BaseModel):
|
|
|
266
274
|
)
|
|
267
275
|
task_dim = domain_dim + user_dim + item_dim
|
|
268
276
|
|
|
269
|
-
|
|
277
|
+
# EPNet: shared feature-level gate (paper's EPNet).
|
|
278
|
+
self.epnet = GateMLP(
|
|
270
279
|
input_dim=input_dim + domain_dim,
|
|
271
|
-
hidden_dim=
|
|
280
|
+
hidden_dim=feature_gate_mlp_params["hidden_dim"],
|
|
272
281
|
output_dim=input_dim,
|
|
273
|
-
activation=
|
|
274
|
-
dropout=
|
|
275
|
-
use_bn=
|
|
282
|
+
activation=feature_gate_mlp_params["activation"],
|
|
283
|
+
dropout=feature_gate_mlp_params["dropout"],
|
|
284
|
+
use_bn=feature_gate_mlp_params["use_bn"],
|
|
285
|
+
scale_factor=2.0,
|
|
276
286
|
)
|
|
277
287
|
|
|
278
|
-
|
|
288
|
+
# PPNet: per-task gated towers (paper's PPNet).
|
|
289
|
+
self.ppnet_blocks = nn.ModuleList(
|
|
279
290
|
[
|
|
280
|
-
|
|
291
|
+
PPNet(
|
|
281
292
|
input_dim=input_dim,
|
|
282
293
|
output_dim=1,
|
|
283
294
|
gate_input_dim=input_dim + task_dim,
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
hidden_activations=dnn_activation,
|
|
287
|
-
dropout_rates=dnn_dropout,
|
|
288
|
-
batch_norm=dnn_use_bn,
|
|
295
|
+
mlp_params=mlp_params,
|
|
296
|
+
gate_mlp_params=gate_mlp_params,
|
|
289
297
|
use_bias=use_bias,
|
|
290
|
-
gate_activation=gate_activation,
|
|
291
|
-
gate_dropout=gate_dropout,
|
|
292
|
-
gate_use_bn=gate_use_bn,
|
|
293
298
|
)
|
|
294
299
|
for _ in range(self.nums_task)
|
|
295
300
|
]
|
|
@@ -298,9 +303,9 @@ class PEPNet(BaseModel):
|
|
|
298
303
|
self.prediction_layer = TaskHead(
|
|
299
304
|
task_type=self.task, task_dims=[1] * self.nums_task
|
|
300
305
|
)
|
|
301
|
-
self.grad_norm_shared_modules = ["embedding", "
|
|
306
|
+
self.grad_norm_shared_modules = ["embedding", "epnet"]
|
|
302
307
|
self.register_regularization_weights(
|
|
303
|
-
embedding_attr="embedding", include_modules=["
|
|
308
|
+
embedding_attr="embedding", include_modules=["epnet", "ppnet_blocks"]
|
|
304
309
|
)
|
|
305
310
|
|
|
306
311
|
def forward(self, x: dict[str, torch.Tensor]) -> torch.Tensor:
|
|
@@ -325,11 +330,11 @@ class PEPNet(BaseModel):
|
|
|
325
330
|
task_sf_emb = torch.cat(task_parts, dim=-1)
|
|
326
331
|
|
|
327
332
|
gate_input = torch.cat([dnn_input.detach(), domain_emb], dim=-1)
|
|
328
|
-
dnn_input = self.
|
|
333
|
+
dnn_input = self.epnet(gate_input) * dnn_input
|
|
329
334
|
|
|
330
335
|
task_logits = []
|
|
331
|
-
for block in self.
|
|
332
|
-
|
|
336
|
+
for block in self.ppnet_blocks:
|
|
337
|
+
task_logits.append(block(o_ep=dnn_input, o_prior=task_sf_emb))
|
|
333
338
|
|
|
334
339
|
y = torch.cat(task_logits, dim=1)
|
|
335
340
|
return self.prediction_layer(y)
|
nextrec/models/multi_task/ple.py
CHANGED
|
@@ -3,9 +3,8 @@ Date: create on 09/11/2025
|
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] Tang H, Liu J, Zhao M,
|
|
7
|
-
|
|
8
|
-
(https://dl.acm.org/doi/10.1145/3383313.3412236)
|
|
6
|
+
- [1] Tang H, Liu J, Zhao M, Gong X. Progressive Layered Extraction (PLE): A Novel Multi-Task Learning (MTL) Model for Personalized Recommendations. In: Proceedings of the 14th ACM Conference on Recommender Systems (RecSys ’20), 2020, pp. 269–278.
|
|
7
|
+
URL: https://dl.acm.org/doi/10.1145/3383313.3412236
|
|
9
8
|
|
|
10
9
|
Progressive Layered Extraction (PLE) advances multi-task learning by stacking CGC
|
|
11
10
|
(Customized Gate Control) blocks that mix shared and task-specific experts. Each
|
|
@@ -67,18 +66,21 @@ class CGCLayer(nn.Module):
|
|
|
67
66
|
nums_task: int,
|
|
68
67
|
num_shared_experts: int,
|
|
69
68
|
num_specific_experts: int,
|
|
70
|
-
|
|
71
|
-
|
|
69
|
+
shared_expert_mlp_params: dict,
|
|
70
|
+
specific_expert_mlp_params: list[dict],
|
|
72
71
|
):
|
|
73
72
|
super().__init__()
|
|
74
73
|
if nums_task < 1:
|
|
75
74
|
raise ValueError("nums_task must be >= 1")
|
|
76
75
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
76
|
+
if len(specific_expert_mlp_params) != nums_task:
|
|
77
|
+
raise ValueError(
|
|
78
|
+
"Length of specific_expert_mlp_params "
|
|
79
|
+
f"({len(specific_expert_mlp_params)}) must match number of tasks ({nums_task})."
|
|
80
|
+
)
|
|
81
|
+
specific_params_list = [params.copy() for params in specific_expert_mlp_params]
|
|
80
82
|
|
|
81
|
-
self.output_dim = get_mlp_output_dim(
|
|
83
|
+
self.output_dim = get_mlp_output_dim(shared_expert_mlp_params, input_dim)
|
|
82
84
|
specific_dims = [
|
|
83
85
|
get_mlp_output_dim(params, input_dim) for params in specific_params_list
|
|
84
86
|
]
|
|
@@ -94,7 +96,7 @@ class CGCLayer(nn.Module):
|
|
|
94
96
|
MLP(
|
|
95
97
|
input_dim=input_dim,
|
|
96
98
|
output_dim=None,
|
|
97
|
-
**
|
|
99
|
+
**shared_expert_mlp_params,
|
|
98
100
|
)
|
|
99
101
|
for _ in range(num_shared_experts)
|
|
100
102
|
]
|
|
@@ -166,18 +168,6 @@ class CGCLayer(nn.Module):
|
|
|
166
168
|
|
|
167
169
|
return new_task_fea, new_shared
|
|
168
170
|
|
|
169
|
-
@staticmethod
|
|
170
|
-
def normalize_specific_params(
|
|
171
|
-
params: dict | list[dict], nums_task: int
|
|
172
|
-
) -> list[dict]:
|
|
173
|
-
if isinstance(params, list):
|
|
174
|
-
if len(params) != nums_task:
|
|
175
|
-
raise ValueError(
|
|
176
|
-
f"Length of specific_expert_params ({len(params)}) must match nums_task ({nums_task})."
|
|
177
|
-
)
|
|
178
|
-
return [p.copy() for p in params]
|
|
179
|
-
return [params.copy() for _ in range(nums_task)]
|
|
180
|
-
|
|
181
171
|
|
|
182
172
|
class PLE(BaseModel):
|
|
183
173
|
"""
|
|
@@ -205,12 +195,12 @@ class PLE(BaseModel):
|
|
|
205
195
|
dense_features: list[DenseFeature] | None = None,
|
|
206
196
|
sparse_features: list[SparseFeature] | None = None,
|
|
207
197
|
sequence_features: list[SequenceFeature] | None = None,
|
|
208
|
-
|
|
209
|
-
|
|
198
|
+
shared_expert_mlp_params: dict | None = None,
|
|
199
|
+
specific_expert_mlp_params: list[dict] | None = None,
|
|
210
200
|
num_shared_experts: int = 2,
|
|
211
201
|
num_specific_experts: int = 2,
|
|
212
202
|
num_levels: int = 2,
|
|
213
|
-
|
|
203
|
+
tower_mlp_params_list: list[dict] | None = None,
|
|
214
204
|
target: list[str] | None = None,
|
|
215
205
|
task: str | list[str] | None = None,
|
|
216
206
|
**kwargs,
|
|
@@ -218,24 +208,19 @@ class PLE(BaseModel):
|
|
|
218
208
|
|
|
219
209
|
self.nums_task = len(target) if target is not None else 1
|
|
220
210
|
|
|
221
|
-
|
|
222
|
-
if
|
|
223
|
-
resolved_task = self.default_task
|
|
224
|
-
elif isinstance(resolved_task, str):
|
|
225
|
-
resolved_task = [resolved_task] * self.nums_task
|
|
226
|
-
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
227
|
-
resolved_task = resolved_task * self.nums_task
|
|
228
|
-
elif len(resolved_task) != self.nums_task:
|
|
211
|
+
shared_expert_mlp_params = shared_expert_mlp_params or {}
|
|
212
|
+
if specific_expert_mlp_params is None:
|
|
229
213
|
raise ValueError(
|
|
230
|
-
|
|
214
|
+
"specific_expert_mlp_params must be a list of dicts, one per task."
|
|
231
215
|
)
|
|
216
|
+
tower_mlp_params_list = tower_mlp_params_list or []
|
|
232
217
|
|
|
233
218
|
super(PLE, self).__init__(
|
|
234
219
|
dense_features=dense_features,
|
|
235
220
|
sparse_features=sparse_features,
|
|
236
221
|
sequence_features=sequence_features,
|
|
237
222
|
target=target,
|
|
238
|
-
task=
|
|
223
|
+
task=task,
|
|
239
224
|
**kwargs,
|
|
240
225
|
)
|
|
241
226
|
|
|
@@ -245,9 +230,10 @@ class PLE(BaseModel):
|
|
|
245
230
|
self.num_specific_experts = num_specific_experts
|
|
246
231
|
self.num_levels = num_levels
|
|
247
232
|
|
|
248
|
-
if len(
|
|
233
|
+
if len(tower_mlp_params_list) != self.nums_task:
|
|
249
234
|
raise ValueError(
|
|
250
|
-
|
|
235
|
+
"Number of tower mlp params "
|
|
236
|
+
f"({len(tower_mlp_params_list)}) must match number of tasks ({self.nums_task})"
|
|
251
237
|
)
|
|
252
238
|
# Embedding layer
|
|
253
239
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
@@ -260,10 +246,10 @@ class PLE(BaseModel):
|
|
|
260
246
|
|
|
261
247
|
# Get expert output dimension
|
|
262
248
|
if (
|
|
263
|
-
"hidden_dims" in
|
|
264
|
-
and len(
|
|
249
|
+
"hidden_dims" in shared_expert_mlp_params
|
|
250
|
+
and len(shared_expert_mlp_params["hidden_dims"]) > 0
|
|
265
251
|
):
|
|
266
|
-
expert_output_dim =
|
|
252
|
+
expert_output_dim = shared_expert_mlp_params["hidden_dims"][-1]
|
|
267
253
|
else:
|
|
268
254
|
expert_output_dim = input_dim
|
|
269
255
|
|
|
@@ -276,8 +262,8 @@ class PLE(BaseModel):
|
|
|
276
262
|
nums_task=self.nums_task,
|
|
277
263
|
num_shared_experts=num_shared_experts,
|
|
278
264
|
num_specific_experts=num_specific_experts,
|
|
279
|
-
|
|
280
|
-
|
|
265
|
+
shared_expert_mlp_params=shared_expert_mlp_params,
|
|
266
|
+
specific_expert_mlp_params=specific_expert_mlp_params,
|
|
281
267
|
)
|
|
282
268
|
self.cgc_layers.append(cgc_layer)
|
|
283
269
|
expert_output_dim = cgc_layer.output_dim
|
|
@@ -285,8 +271,8 @@ class PLE(BaseModel):
|
|
|
285
271
|
|
|
286
272
|
# Task-specific towers
|
|
287
273
|
self.towers = nn.ModuleList()
|
|
288
|
-
for
|
|
289
|
-
tower = MLP(input_dim=expert_output_dim, output_dim=1, **
|
|
274
|
+
for tower_mlp_params in tower_mlp_params_list:
|
|
275
|
+
tower = MLP(input_dim=expert_output_dim, output_dim=1, **tower_mlp_params)
|
|
290
276
|
self.towers.append(tower)
|
|
291
277
|
self.prediction_layer = TaskHead(
|
|
292
278
|
task_type=self.task, task_dims=[1] * self.nums_task
|
|
@@ -3,7 +3,8 @@ Date: create on 28/11/2025
|
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] Wang
|
|
6
|
+
- [1] Dai S, Lin H, Zhao Z, Lin J, Wu H, Wang Z, Yang S, Liu J. POSO: Personalized Cold Start Modules for Large-scale Recommender Systems. arXiv preprint arXiv:2108.04690, 2021.
|
|
7
|
+
URL: https://arxiv.org/abs/2108.04690
|
|
7
8
|
|
|
8
9
|
POSO (Personalized cOld-start mOdules) augments backbone recommenders by injecting a
|
|
9
10
|
personalized cold-start vector `pc` that gates hidden units layer by layer. Each fully
|
|
@@ -49,6 +50,7 @@ from nextrec.basic.layers import MLP, EmbeddingLayer
|
|
|
49
50
|
from nextrec.basic.heads import TaskHead
|
|
50
51
|
from nextrec.basic.model import BaseModel
|
|
51
52
|
from nextrec.utils.model import select_features
|
|
53
|
+
from nextrec.utils.types import TaskTypeName
|
|
52
54
|
|
|
53
55
|
|
|
54
56
|
class POSOGate(nn.Module):
|
|
@@ -306,9 +308,9 @@ class POSO(BaseModel):
|
|
|
306
308
|
pc_dense_features: list[str] | None,
|
|
307
309
|
pc_sparse_features: list[str] | None,
|
|
308
310
|
pc_sequence_features: list[str] | None,
|
|
309
|
-
|
|
311
|
+
tower_mlp_params_list: list[dict],
|
|
310
312
|
target: list[str] | None = None,
|
|
311
|
-
task:
|
|
313
|
+
task: TaskTypeName | list[TaskTypeName] | None = None,
|
|
312
314
|
architecture: Literal["mlp", "mmoe"] = "mlp",
|
|
313
315
|
# POSO gating defaults
|
|
314
316
|
gate_hidden_dim: int = 32,
|
|
@@ -327,22 +329,10 @@ class POSO(BaseModel):
|
|
|
327
329
|
):
|
|
328
330
|
self.nums_task = len(target)
|
|
329
331
|
|
|
330
|
-
|
|
331
|
-
resolved_task = task
|
|
332
|
-
if resolved_task is None:
|
|
333
|
-
resolved_task = self.default_task
|
|
334
|
-
elif isinstance(resolved_task, str):
|
|
335
|
-
resolved_task = [resolved_task] * self.nums_task
|
|
336
|
-
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
337
|
-
resolved_task = resolved_task * self.nums_task
|
|
338
|
-
elif len(resolved_task) != self.nums_task:
|
|
332
|
+
if len(tower_mlp_params_list) != self.nums_task:
|
|
339
333
|
raise ValueError(
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
if len(tower_params_list) != self.nums_task:
|
|
344
|
-
raise ValueError(
|
|
345
|
-
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.nums_task})"
|
|
334
|
+
"Number of tower mlp params "
|
|
335
|
+
f"({len(tower_mlp_params_list)}) must match number of tasks ({self.nums_task})"
|
|
346
336
|
)
|
|
347
337
|
|
|
348
338
|
super().__init__(
|
|
@@ -350,7 +340,7 @@ class POSO(BaseModel):
|
|
|
350
340
|
sparse_features=sparse_features,
|
|
351
341
|
sequence_features=sequence_features,
|
|
352
342
|
target=target,
|
|
353
|
-
task=
|
|
343
|
+
task=task,
|
|
354
344
|
**kwargs,
|
|
355
345
|
)
|
|
356
346
|
|
|
@@ -415,11 +405,12 @@ class POSO(BaseModel):
|
|
|
415
405
|
if self.architecture == "mlp":
|
|
416
406
|
self.towers = nn.ModuleList()
|
|
417
407
|
self.tower_heads = nn.ModuleList()
|
|
418
|
-
for tower_params in
|
|
408
|
+
for tower_params in tower_mlp_params_list:
|
|
419
409
|
hidden_dims = tower_params.get("hidden_dims")
|
|
420
410
|
if not hidden_dims:
|
|
421
411
|
raise ValueError(
|
|
422
|
-
"
|
|
412
|
+
"tower_mlp_params_list must include a non-empty 'hidden_dims' "
|
|
413
|
+
"list for POSO-MLP towers."
|
|
423
414
|
)
|
|
424
415
|
dropout = tower_params.get("dropout", 0.0)
|
|
425
416
|
tower = POSOMLP(
|
|
@@ -463,7 +454,7 @@ class POSO(BaseModel):
|
|
|
463
454
|
output_dim=1,
|
|
464
455
|
**tower_params,
|
|
465
456
|
)
|
|
466
|
-
for tower_params in
|
|
457
|
+
for tower_params in tower_mlp_params_list
|
|
467
458
|
]
|
|
468
459
|
)
|
|
469
460
|
self.tower_heads = None
|
|
@@ -2,9 +2,6 @@
|
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
|
-
Reference:
|
|
6
|
-
[1] Caruana R. Multitask learning[J]. Machine Learning, 1997, 28: 41-75.
|
|
7
|
-
(https://link.springer.com/article/10.1023/A:1007379606734)
|
|
8
5
|
|
|
9
6
|
Shared-Bottom is the classic hard-parameter-sharing baseline for multi-task learning.
|
|
10
7
|
All tasks share a common bottom network to learn general representations, and each
|
|
@@ -65,8 +62,8 @@ class ShareBottom(BaseModel):
|
|
|
65
62
|
dense_features: list[DenseFeature],
|
|
66
63
|
sparse_features: list[SparseFeature],
|
|
67
64
|
sequence_features: list[SequenceFeature],
|
|
68
|
-
|
|
69
|
-
|
|
65
|
+
bottom_mlp_params: dict,
|
|
66
|
+
tower_mlp_params_list: list[dict],
|
|
70
67
|
target: list[str],
|
|
71
68
|
task: str | list[str] | None = None,
|
|
72
69
|
**kwargs,
|
|
@@ -74,32 +71,21 @@ class ShareBottom(BaseModel):
|
|
|
74
71
|
|
|
75
72
|
self.nums_task = len(target)
|
|
76
73
|
|
|
77
|
-
resolved_task = task
|
|
78
|
-
if resolved_task is None:
|
|
79
|
-
resolved_task = self.default_task
|
|
80
|
-
elif isinstance(resolved_task, str):
|
|
81
|
-
resolved_task = [resolved_task] * self.nums_task
|
|
82
|
-
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
83
|
-
resolved_task = resolved_task * self.nums_task
|
|
84
|
-
elif len(resolved_task) != self.nums_task:
|
|
85
|
-
raise ValueError(
|
|
86
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
87
|
-
)
|
|
88
|
-
|
|
89
74
|
super(ShareBottom, self).__init__(
|
|
90
75
|
dense_features=dense_features,
|
|
91
76
|
sparse_features=sparse_features,
|
|
92
77
|
sequence_features=sequence_features,
|
|
93
78
|
target=target,
|
|
94
|
-
task=
|
|
79
|
+
task=task,
|
|
95
80
|
**kwargs,
|
|
96
81
|
)
|
|
97
82
|
|
|
98
83
|
# Number of tasks
|
|
99
84
|
self.nums_task = len(target)
|
|
100
|
-
if len(
|
|
85
|
+
if len(tower_mlp_params_list) != self.nums_task:
|
|
101
86
|
raise ValueError(
|
|
102
|
-
|
|
87
|
+
"Number of tower mlp params "
|
|
88
|
+
f"({len(tower_mlp_params_list)}) must match number of tasks ({self.nums_task})"
|
|
103
89
|
)
|
|
104
90
|
# Embedding layer
|
|
105
91
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
@@ -110,19 +96,22 @@ class ShareBottom(BaseModel):
|
|
|
110
96
|
# input_dim = emb_dim_total + dense_input_dim
|
|
111
97
|
|
|
112
98
|
# Shared bottom network
|
|
113
|
-
self.bottom = MLP(input_dim=input_dim, output_dim=None, **
|
|
99
|
+
self.bottom = MLP(input_dim=input_dim, output_dim=None, **bottom_mlp_params)
|
|
114
100
|
self.grad_norm_shared_modules = ["embedding", "bottom"]
|
|
115
101
|
|
|
116
102
|
# Get bottom output dimension
|
|
117
|
-
if
|
|
118
|
-
|
|
103
|
+
if (
|
|
104
|
+
"hidden_dims" in bottom_mlp_params
|
|
105
|
+
and len(bottom_mlp_params["hidden_dims"]) > 0
|
|
106
|
+
):
|
|
107
|
+
bottom_output_dim = bottom_mlp_params["hidden_dims"][-1]
|
|
119
108
|
else:
|
|
120
109
|
bottom_output_dim = input_dim
|
|
121
110
|
|
|
122
111
|
# Task-specific towers
|
|
123
112
|
self.towers = nn.ModuleList()
|
|
124
|
-
for
|
|
125
|
-
tower = MLP(input_dim=bottom_output_dim, output_dim=1, **
|
|
113
|
+
for tower_mlp_params in tower_mlp_params_list:
|
|
114
|
+
tower = MLP(input_dim=bottom_output_dim, output_dim=1, **tower_mlp_params)
|
|
126
115
|
self.towers.append(tower)
|
|
127
116
|
self.prediction_layer = TaskHead(
|
|
128
117
|
task_type=self.task, task_dims=[1] * self.nums_task
|
nextrec/models/ranking/afm.py
CHANGED
|
@@ -3,8 +3,8 @@ Date: create on 09/11/2025
|
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] Xiao J, Ye H, He X, et al. Attentional
|
|
7
|
-
|
|
6
|
+
- [1] Xiao J, Ye H, He X, et al. Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks
|
|
7
|
+
URL: https://arxiv.org/abs/1708.04617
|
|
8
8
|
|
|
9
9
|
Attentional Factorization Machine (AFM) builds on FM by learning an importance
|
|
10
10
|
weight for every second-order interaction instead of treating all pairs equally.
|
|
@@ -3,10 +3,8 @@ Date: create on 09/11/2025
|
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] Song W, Shi C, Xiao Z, et al.
|
|
7
|
-
|
|
8
|
-
on information and knowledge management. 2019: 1161-1170.
|
|
9
|
-
(https://arxiv.org/abs/1810.11921)
|
|
6
|
+
- [1] Song W, Shi C, Xiao Z, et al. AutoInt: Automatic feature interaction learning via self-attentive neural networks. In: Proceedings of the 28th ACM International Conference on Information and Knowledge Management (CIKM ’19), 2019, pp. 1161–1170.
|
|
7
|
+
URL: https://arxiv.org/abs/1810.11921
|
|
10
8
|
|
|
11
9
|
AutoInt is a CTR prediction model that leverages multi-head self-attention
|
|
12
10
|
to automatically learn high-order feature interactions in an explicit and
|
nextrec/models/ranking/dcn.py
CHANGED
|
@@ -3,9 +3,8 @@ Date: create on 09/11/2025
|
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]
|
|
7
|
-
|
|
8
|
-
(https://arxiv.org/abs/1708.05123)
|
|
6
|
+
- [1] Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C] //Proceedings of the ADKDD'17. 2017: 1-7.
|
|
7
|
+
URL: https://arxiv.org/abs/1708.05123
|
|
9
8
|
|
|
10
9
|
Deep & Cross Network (DCN) mixes explicit polynomial feature crosses with a deep
|
|
11
10
|
MLP branch to capture both low-order and high-order interactions for CTR-style
|
nextrec/models/ranking/dcn_v2.py
CHANGED
|
@@ -3,9 +3,8 @@ Date: create on 09/11/2025
|
|
|
3
3
|
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] R. Wang et al. DCN V2: Improved Deep & Cross Network and Practical Lessons for
|
|
7
|
-
|
|
8
|
-
(https://arxiv.org/abs/2008.13535)
|
|
6
|
+
- [1] R. Wang et al. DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems. KDD 2021.
|
|
7
|
+
URL: https://arxiv.org/abs/2008.13535
|
|
9
8
|
|
|
10
9
|
DCN v2 enhances the original Deep & Cross Network by replacing the scalar cross
|
|
11
10
|
weights with vector-wise (matrix) parameters and a Mixture-of-Low-Rank-Experts
|