nextrec 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/heads.py +1 -3
- nextrec/basic/loggers.py +5 -5
- nextrec/basic/model.py +210 -82
- nextrec/cli.py +5 -5
- nextrec/data/dataloader.py +93 -95
- nextrec/data/preprocessor.py +108 -46
- nextrec/loss/grad_norm.py +13 -13
- nextrec/models/multi_task/esmm.py +9 -11
- nextrec/models/multi_task/mmoe.py +18 -18
- nextrec/models/multi_task/ple.py +33 -33
- nextrec/models/multi_task/poso.py +21 -20
- nextrec/models/multi_task/share_bottom.py +16 -16
- nextrec/models/ranking/afm.py +2 -2
- nextrec/models/ranking/autoint.py +2 -2
- nextrec/models/ranking/dcn.py +2 -2
- nextrec/models/ranking/dcn_v2.py +2 -2
- nextrec/models/ranking/deepfm.py +2 -2
- nextrec/models/ranking/eulernet.py +2 -2
- nextrec/models/ranking/ffm.py +2 -2
- nextrec/models/ranking/fm.py +2 -2
- nextrec/models/ranking/lr.py +2 -2
- nextrec/models/ranking/masknet.py +2 -4
- nextrec/models/ranking/pnn.py +3 -3
- nextrec/models/ranking/widedeep.py +6 -7
- nextrec/models/ranking/xdeepfm.py +3 -3
- nextrec/utils/console.py +1 -1
- nextrec/utils/data.py +154 -32
- nextrec/utils/model.py +86 -1
- {nextrec-0.4.17.dist-info → nextrec-0.4.19.dist-info}/METADATA +8 -7
- {nextrec-0.4.17.dist-info → nextrec-0.4.19.dist-info}/RECORD +34 -34
- {nextrec-0.4.17.dist-info → nextrec-0.4.19.dist-info}/WHEEL +0 -0
- {nextrec-0.4.17.dist-info → nextrec-0.4.19.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.17.dist-info → nextrec-0.4.19.dist-info}/licenses/LICENSE +0 -0
nextrec/loss/grad_norm.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
GradNorm loss weighting for multi-task learning.
|
|
3
3
|
|
|
4
4
|
Date: create on 27/10/2025
|
|
5
|
-
Checkpoint: edit on
|
|
5
|
+
Checkpoint: edit on 24/12/2025
|
|
6
6
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
7
7
|
|
|
8
8
|
Reference:
|
|
@@ -45,7 +45,7 @@ class GradNormLossWeighting:
|
|
|
45
45
|
Adaptive multi-task loss weighting with GradNorm.
|
|
46
46
|
|
|
47
47
|
Args:
|
|
48
|
-
|
|
48
|
+
nums_task: Number of tasks.
|
|
49
49
|
alpha: GradNorm balancing strength.
|
|
50
50
|
lr: Learning rate for the weight optimizer.
|
|
51
51
|
init_weights: Optional initial weights per task.
|
|
@@ -58,7 +58,7 @@ class GradNormLossWeighting:
|
|
|
58
58
|
|
|
59
59
|
def __init__(
|
|
60
60
|
self,
|
|
61
|
-
|
|
61
|
+
nums_task: int,
|
|
62
62
|
alpha: float = 1.5,
|
|
63
63
|
lr: float = 0.025,
|
|
64
64
|
init_weights: Iterable[float] | None = None,
|
|
@@ -68,9 +68,9 @@ class GradNormLossWeighting:
|
|
|
68
68
|
init_ema_decay: float = 0.9,
|
|
69
69
|
eps: float = 1e-8,
|
|
70
70
|
) -> None:
|
|
71
|
-
if
|
|
72
|
-
raise ValueError("GradNorm requires
|
|
73
|
-
self.
|
|
71
|
+
if nums_task <= 1:
|
|
72
|
+
raise ValueError("GradNorm requires nums_task > 1.")
|
|
73
|
+
self.nums_task = nums_task
|
|
74
74
|
self.alpha = alpha
|
|
75
75
|
self.eps = eps
|
|
76
76
|
if ema_decay is not None:
|
|
@@ -87,12 +87,12 @@ class GradNormLossWeighting:
|
|
|
87
87
|
self.init_ema_count = 0
|
|
88
88
|
|
|
89
89
|
if init_weights is None:
|
|
90
|
-
weights = torch.ones(self.
|
|
90
|
+
weights = torch.ones(self.nums_task, dtype=torch.float32)
|
|
91
91
|
else:
|
|
92
92
|
weights = torch.tensor(list(init_weights), dtype=torch.float32)
|
|
93
|
-
if weights.numel() != self.
|
|
93
|
+
if weights.numel() != self.nums_task:
|
|
94
94
|
raise ValueError(
|
|
95
|
-
"init_weights length must match
|
|
95
|
+
"init_weights length must match nums_task for GradNorm."
|
|
96
96
|
)
|
|
97
97
|
if device is not None:
|
|
98
98
|
weights = weights.to(device)
|
|
@@ -123,9 +123,9 @@ class GradNormLossWeighting:
|
|
|
123
123
|
"""
|
|
124
124
|
Return weighted total loss and update task weights with GradNorm.
|
|
125
125
|
"""
|
|
126
|
-
if len(task_losses) != self.
|
|
126
|
+
if len(task_losses) != self.nums_task:
|
|
127
127
|
raise ValueError(
|
|
128
|
-
f"Expected {self.
|
|
128
|
+
f"Expected {self.nums_task} task losses, got {len(task_losses)}."
|
|
129
129
|
)
|
|
130
130
|
shared_params = [p for p in shared_params if p.requires_grad]
|
|
131
131
|
if not shared_params:
|
|
@@ -152,7 +152,7 @@ class GradNormLossWeighting:
|
|
|
152
152
|
|
|
153
153
|
weights_detached = self.weights.detach()
|
|
154
154
|
weighted_losses = [
|
|
155
|
-
weights_detached[i] * task_losses[i] for i in range(self.
|
|
155
|
+
weights_detached[i] * task_losses[i] for i in range(self.nums_task)
|
|
156
156
|
]
|
|
157
157
|
total_loss = torch.stack(weighted_losses).sum()
|
|
158
158
|
|
|
@@ -226,7 +226,7 @@ class GradNormLossWeighting:
|
|
|
226
226
|
|
|
227
227
|
with torch.no_grad():
|
|
228
228
|
w = self.weights.clamp(min=self.eps)
|
|
229
|
-
w = w * self.
|
|
229
|
+
w = w * self.nums_task / (w.sum() + self.eps)
|
|
230
230
|
self.weights.copy_(w)
|
|
231
231
|
|
|
232
232
|
self.pending_grad = None
|
nextrec/models/multi_task/esmm.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Ma X, Zhao L, Huang G, et al. Entire space multi-task model: An effective approach
|
|
@@ -101,17 +101,17 @@ class ESMM(BaseModel):
|
|
|
101
101
|
f"ESMM requires exactly 2 targets (ctr and ctcvr), got {len(target)}"
|
|
102
102
|
)
|
|
103
103
|
|
|
104
|
-
self.
|
|
104
|
+
self.nums_task = len(target)
|
|
105
105
|
resolved_task = task
|
|
106
106
|
if resolved_task is None:
|
|
107
107
|
resolved_task = self.default_task
|
|
108
108
|
elif isinstance(resolved_task, str):
|
|
109
|
-
resolved_task = [resolved_task] * self.
|
|
110
|
-
elif len(resolved_task) == 1 and self.
|
|
111
|
-
resolved_task = resolved_task * self.
|
|
112
|
-
elif len(resolved_task) != self.
|
|
109
|
+
resolved_task = [resolved_task] * self.nums_task
|
|
110
|
+
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
111
|
+
resolved_task = resolved_task * self.nums_task
|
|
112
|
+
elif len(resolved_task) != self.nums_task:
|
|
113
113
|
raise ValueError(
|
|
114
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.
|
|
114
|
+
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
115
115
|
)
|
|
116
116
|
# resolved_task is now guaranteed to be a list[str]
|
|
117
117
|
|
|
@@ -140,9 +140,7 @@ class ESMM(BaseModel):
|
|
|
140
140
|
# CVR tower
|
|
141
141
|
self.cvr_tower = MLP(input_dim=input_dim, output_layer=True, **cvr_params)
|
|
142
142
|
self.grad_norm_shared_modules = ["embedding"]
|
|
143
|
-
self.prediction_layer = TaskHead(
|
|
144
|
-
task_type=self.default_task, task_dims=[1, 1]
|
|
145
|
-
)
|
|
143
|
+
self.prediction_layer = TaskHead(task_type=self.default_task, task_dims=[1, 1])
|
|
146
144
|
# Register regularization weights
|
|
147
145
|
self.register_regularization_weights(
|
|
148
146
|
embedding_attr="embedding", include_modules=["ctr_tower", "cvr_tower"]
|
|
@@ -168,4 +166,4 @@ class ESMM(BaseModel):
|
|
|
168
166
|
|
|
169
167
|
# Output: [CTR, CTCVR], We supervise CTR with click labels and CTCVR with conversion labels
|
|
170
168
|
y = torch.cat([ctr, ctcvr], dim=1) # [B, 2]
|
|
171
|
-
return y # [B, 2], where y[:, 0] is CTR and y[:, 1] is CTCVR
|
|
169
|
+
return y # [B, 2], where y[:, 0] is CTR and y[:, 1] is CTCVR
|
nextrec/models/multi_task/mmoe.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Ma J, Zhao Z, Yi X, et al. Modeling task relationships in multi-task learning with
|
|
@@ -67,9 +67,9 @@ class MMOE(BaseModel):
|
|
|
67
67
|
|
|
68
68
|
@property
|
|
69
69
|
def default_task(self):
|
|
70
|
-
|
|
71
|
-
if
|
|
72
|
-
return ["binary"] *
|
|
70
|
+
nums_task = getattr(self, "nums_task", None)
|
|
71
|
+
if nums_task is not None and nums_task > 0:
|
|
72
|
+
return ["binary"] * nums_task
|
|
73
73
|
return ["binary"]
|
|
74
74
|
|
|
75
75
|
def __init__(
|
|
@@ -107,18 +107,18 @@ class MMOE(BaseModel):
|
|
|
107
107
|
elif isinstance(target, str):
|
|
108
108
|
target = [target]
|
|
109
109
|
|
|
110
|
-
self.
|
|
110
|
+
self.nums_task = len(target) if target else 1
|
|
111
111
|
|
|
112
112
|
resolved_task = task
|
|
113
113
|
if resolved_task is None:
|
|
114
114
|
resolved_task = self.default_task
|
|
115
115
|
elif isinstance(resolved_task, str):
|
|
116
|
-
resolved_task = [resolved_task] * self.
|
|
117
|
-
elif len(resolved_task) == 1 and self.
|
|
118
|
-
resolved_task = resolved_task * self.
|
|
119
|
-
elif len(resolved_task) != self.
|
|
116
|
+
resolved_task = [resolved_task] * self.nums_task
|
|
117
|
+
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
118
|
+
resolved_task = resolved_task * self.nums_task
|
|
119
|
+
elif len(resolved_task) != self.nums_task:
|
|
120
120
|
raise ValueError(
|
|
121
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.
|
|
121
|
+
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
122
122
|
)
|
|
123
123
|
|
|
124
124
|
super(MMOE, self).__init__(
|
|
@@ -138,12 +138,12 @@ class MMOE(BaseModel):
|
|
|
138
138
|
self.loss = loss
|
|
139
139
|
|
|
140
140
|
# Number of tasks and experts
|
|
141
|
-
self.
|
|
141
|
+
self.nums_task = len(target)
|
|
142
142
|
self.num_experts = num_experts
|
|
143
143
|
|
|
144
|
-
if len(tower_params_list) != self.
|
|
144
|
+
if len(tower_params_list) != self.nums_task:
|
|
145
145
|
raise ValueError(
|
|
146
|
-
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.
|
|
146
|
+
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.nums_task})"
|
|
147
147
|
)
|
|
148
148
|
|
|
149
149
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
@@ -163,7 +163,7 @@ class MMOE(BaseModel):
|
|
|
163
163
|
|
|
164
164
|
# Task-specific gates
|
|
165
165
|
self.gates = nn.ModuleList()
|
|
166
|
-
for _ in range(self.
|
|
166
|
+
for _ in range(self.nums_task):
|
|
167
167
|
gate = nn.Sequential(nn.Linear(input_dim, num_experts), nn.Softmax(dim=1))
|
|
168
168
|
self.gates.append(gate)
|
|
169
169
|
self.grad_norm_shared_modules = ["embedding", "experts", "gates"]
|
|
@@ -174,7 +174,7 @@ class MMOE(BaseModel):
|
|
|
174
174
|
tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
|
|
175
175
|
self.towers.append(tower)
|
|
176
176
|
self.prediction_layer = TaskHead(
|
|
177
|
-
task_type=self.default_task, task_dims=[1] * self.
|
|
177
|
+
task_type=self.default_task, task_dims=[1] * self.nums_task
|
|
178
178
|
)
|
|
179
179
|
# Register regularization weights
|
|
180
180
|
self.register_regularization_weights(
|
|
@@ -199,7 +199,7 @@ class MMOE(BaseModel):
|
|
|
199
199
|
|
|
200
200
|
# Task-specific processing
|
|
201
201
|
task_outputs = []
|
|
202
|
-
for task_idx in range(self.
|
|
202
|
+
for task_idx in range(self.nums_task):
|
|
203
203
|
# Gate weights for this task: [B, num_experts]
|
|
204
204
|
gate_weights = self.gates[task_idx](input_flat) # [B, num_experts]
|
|
205
205
|
|
|
@@ -218,6 +218,6 @@ class MMOE(BaseModel):
|
|
|
218
218
|
tower_output = self.towers[task_idx](gated_output) # [B, 1]
|
|
219
219
|
task_outputs.append(tower_output)
|
|
220
220
|
|
|
221
|
-
# Stack outputs: [B,
|
|
221
|
+
# Stack outputs: [B, nums_task]
|
|
222
222
|
y = torch.cat(task_outputs, dim=1)
|
|
223
|
-
return self.prediction_layer(y)
|
|
223
|
+
return self.prediction_layer(y)
|
nextrec/models/multi_task/ple.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Tang H, Liu J, Zhao M, et al. Progressive layered extraction (PLE): A novel
|
|
@@ -64,18 +64,18 @@ class CGCLayer(nn.Module):
|
|
|
64
64
|
def __init__(
|
|
65
65
|
self,
|
|
66
66
|
input_dim: int,
|
|
67
|
-
|
|
67
|
+
nums_task: int,
|
|
68
68
|
num_shared_experts: int,
|
|
69
69
|
num_specific_experts: int,
|
|
70
70
|
shared_expert_params: dict,
|
|
71
71
|
specific_expert_params: dict | list[dict],
|
|
72
72
|
):
|
|
73
73
|
super().__init__()
|
|
74
|
-
if
|
|
75
|
-
raise ValueError("
|
|
74
|
+
if nums_task < 1:
|
|
75
|
+
raise ValueError("nums_task must be >= 1")
|
|
76
76
|
|
|
77
77
|
specific_params_list = self.normalize_specific_params(
|
|
78
|
-
specific_expert_params,
|
|
78
|
+
specific_expert_params, nums_task
|
|
79
79
|
)
|
|
80
80
|
|
|
81
81
|
self.output_dim = get_mlp_output_dim(shared_expert_params, input_dim)
|
|
@@ -121,23 +121,23 @@ class CGCLayer(nn.Module):
|
|
|
121
121
|
nn.Linear(input_dim, task_gate_expert_num),
|
|
122
122
|
nn.Softmax(dim=1),
|
|
123
123
|
)
|
|
124
|
-
for _ in range(
|
|
124
|
+
for _ in range(nums_task)
|
|
125
125
|
]
|
|
126
126
|
)
|
|
127
|
-
shared_gate_expert_num = num_shared_experts + num_specific_experts *
|
|
127
|
+
shared_gate_expert_num = num_shared_experts + num_specific_experts * nums_task
|
|
128
128
|
self.shared_gate = nn.Sequential(
|
|
129
129
|
nn.Linear(input_dim, shared_gate_expert_num),
|
|
130
130
|
nn.Softmax(dim=1),
|
|
131
131
|
)
|
|
132
132
|
|
|
133
|
-
self.
|
|
133
|
+
self.nums_task = nums_task
|
|
134
134
|
|
|
135
135
|
def forward(
|
|
136
136
|
self, task_inputs: list[torch.Tensor], shared_input: torch.Tensor
|
|
137
137
|
) -> tuple[list[torch.Tensor], torch.Tensor]:
|
|
138
|
-
if len(task_inputs) != self.
|
|
138
|
+
if len(task_inputs) != self.nums_task:
|
|
139
139
|
raise ValueError(
|
|
140
|
-
f"Expected {self.
|
|
140
|
+
f"Expected {self.nums_task} task inputs, got {len(task_inputs)}"
|
|
141
141
|
)
|
|
142
142
|
|
|
143
143
|
shared_outputs = [expert(shared_input) for expert in self.shared_experts]
|
|
@@ -146,7 +146,7 @@ class CGCLayer(nn.Module):
|
|
|
146
146
|
new_task_fea: list[torch.Tensor] = []
|
|
147
147
|
all_specific_for_shared: list[torch.Tensor] = []
|
|
148
148
|
|
|
149
|
-
for task_idx in range(self.
|
|
149
|
+
for task_idx in range(self.nums_task):
|
|
150
150
|
task_input = task_inputs[task_idx]
|
|
151
151
|
task_specific_outputs = [expert(task_input) for expert in self.specific_experts[task_idx]] # type: ignore
|
|
152
152
|
all_specific_for_shared.extend(task_specific_outputs)
|
|
@@ -168,15 +168,15 @@ class CGCLayer(nn.Module):
|
|
|
168
168
|
|
|
169
169
|
@staticmethod
|
|
170
170
|
def normalize_specific_params(
|
|
171
|
-
params: dict | list[dict],
|
|
171
|
+
params: dict | list[dict], nums_task: int
|
|
172
172
|
) -> list[dict]:
|
|
173
173
|
if isinstance(params, list):
|
|
174
|
-
if len(params) !=
|
|
174
|
+
if len(params) != nums_task:
|
|
175
175
|
raise ValueError(
|
|
176
|
-
f"Length of specific_expert_params ({len(params)}) must match
|
|
176
|
+
f"Length of specific_expert_params ({len(params)}) must match nums_task ({nums_task})."
|
|
177
177
|
)
|
|
178
178
|
return [p.copy() for p in params]
|
|
179
|
-
return [params.copy() for _ in range(
|
|
179
|
+
return [params.copy() for _ in range(nums_task)]
|
|
180
180
|
|
|
181
181
|
|
|
182
182
|
class PLE(BaseModel):
|
|
@@ -195,9 +195,9 @@ class PLE(BaseModel):
|
|
|
195
195
|
|
|
196
196
|
@property
|
|
197
197
|
def default_task(self):
|
|
198
|
-
|
|
199
|
-
if
|
|
200
|
-
return ["binary"] *
|
|
198
|
+
nums_task = getattr(self, "nums_task", None)
|
|
199
|
+
if nums_task is not None and nums_task > 0:
|
|
200
|
+
return ["binary"] * nums_task
|
|
201
201
|
return ["binary"]
|
|
202
202
|
|
|
203
203
|
def __init__(
|
|
@@ -225,18 +225,18 @@ class PLE(BaseModel):
|
|
|
225
225
|
**kwargs,
|
|
226
226
|
):
|
|
227
227
|
|
|
228
|
-
self.
|
|
228
|
+
self.nums_task = len(target)
|
|
229
229
|
|
|
230
230
|
resolved_task = task
|
|
231
231
|
if resolved_task is None:
|
|
232
232
|
resolved_task = self.default_task
|
|
233
233
|
elif isinstance(resolved_task, str):
|
|
234
|
-
resolved_task = [resolved_task] * self.
|
|
235
|
-
elif len(resolved_task) == 1 and self.
|
|
236
|
-
resolved_task = resolved_task * self.
|
|
237
|
-
elif len(resolved_task) != self.
|
|
234
|
+
resolved_task = [resolved_task] * self.nums_task
|
|
235
|
+
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
236
|
+
resolved_task = resolved_task * self.nums_task
|
|
237
|
+
elif len(resolved_task) != self.nums_task:
|
|
238
238
|
raise ValueError(
|
|
239
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.
|
|
239
|
+
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
240
240
|
)
|
|
241
241
|
|
|
242
242
|
super(PLE, self).__init__(
|
|
@@ -257,15 +257,15 @@ class PLE(BaseModel):
|
|
|
257
257
|
if self.loss is None:
|
|
258
258
|
self.loss = "bce"
|
|
259
259
|
# Number of tasks, experts, and levels
|
|
260
|
-
self.
|
|
260
|
+
self.nums_task = len(target)
|
|
261
261
|
self.num_shared_experts = num_shared_experts
|
|
262
262
|
self.num_specific_experts = num_specific_experts
|
|
263
263
|
self.num_levels = num_levels
|
|
264
264
|
if optimizer_params is None:
|
|
265
265
|
optimizer_params = {}
|
|
266
|
-
if len(tower_params_list) != self.
|
|
266
|
+
if len(tower_params_list) != self.nums_task:
|
|
267
267
|
raise ValueError(
|
|
268
|
-
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.
|
|
268
|
+
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.nums_task})"
|
|
269
269
|
)
|
|
270
270
|
# Embedding layer
|
|
271
271
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
@@ -288,7 +288,7 @@ class PLE(BaseModel):
|
|
|
288
288
|
level_input_dim = input_dim if level == 0 else expert_output_dim
|
|
289
289
|
cgc_layer = CGCLayer(
|
|
290
290
|
input_dim=level_input_dim,
|
|
291
|
-
|
|
291
|
+
nums_task=self.nums_task,
|
|
292
292
|
num_shared_experts=num_shared_experts,
|
|
293
293
|
num_specific_experts=num_specific_experts,
|
|
294
294
|
shared_expert_params=shared_expert_params,
|
|
@@ -304,7 +304,7 @@ class PLE(BaseModel):
|
|
|
304
304
|
tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
|
|
305
305
|
self.towers.append(tower)
|
|
306
306
|
self.prediction_layer = TaskHead(
|
|
307
|
-
task_type=self.default_task, task_dims=[1] * self.
|
|
307
|
+
task_type=self.default_task, task_dims=[1] * self.nums_task
|
|
308
308
|
)
|
|
309
309
|
# Register regularization weights
|
|
310
310
|
self.register_regularization_weights(
|
|
@@ -322,7 +322,7 @@ class PLE(BaseModel):
|
|
|
322
322
|
input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
|
|
323
323
|
|
|
324
324
|
# Initial features for each task and shared
|
|
325
|
-
task_fea = [input_flat for _ in range(self.
|
|
325
|
+
task_fea = [input_flat for _ in range(self.nums_task)]
|
|
326
326
|
shared_fea = input_flat
|
|
327
327
|
|
|
328
328
|
# Progressive Layered Extraction: CGC
|
|
@@ -331,10 +331,10 @@ class PLE(BaseModel):
|
|
|
331
331
|
|
|
332
332
|
# task tower
|
|
333
333
|
task_outputs = []
|
|
334
|
-
for task_idx in range(self.
|
|
334
|
+
for task_idx in range(self.nums_task):
|
|
335
335
|
tower_output = self.towers[task_idx](task_fea[task_idx]) # [B, 1]
|
|
336
336
|
task_outputs.append(tower_output)
|
|
337
337
|
|
|
338
|
-
# [B,
|
|
338
|
+
# [B, nums_task]
|
|
339
339
|
y = torch.cat(task_outputs, dim=1)
|
|
340
|
-
return self.prediction_layer(y)
|
|
340
|
+
return self.prediction_layer(y)
|
nextrec/models/multi_task/poso.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 28/11/2025
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
3
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
4
5
|
Reference:
|
|
5
6
|
[1] Wang et al. "POSO: Personalized Cold Start Modules for Large-scale Recommender Systems", 2021.
|
|
@@ -196,7 +197,7 @@ class POSOMMoE(nn.Module):
|
|
|
196
197
|
pc_dim: int, # for poso feature dimension
|
|
197
198
|
num_experts: int,
|
|
198
199
|
expert_hidden_dims: list[int],
|
|
199
|
-
|
|
200
|
+
nums_task: int,
|
|
200
201
|
activation: str = "relu",
|
|
201
202
|
expert_dropout: float = 0.0,
|
|
202
203
|
gate_hidden_dim: int = 32, # for poso gate hidden dimension
|
|
@@ -205,7 +206,7 @@ class POSOMMoE(nn.Module):
|
|
|
205
206
|
) -> None:
|
|
206
207
|
super().__init__()
|
|
207
208
|
self.num_experts = num_experts
|
|
208
|
-
self.
|
|
209
|
+
self.nums_task = nums_task
|
|
209
210
|
|
|
210
211
|
# Experts built with framework MLP, same as standard MMoE
|
|
211
212
|
self.experts = nn.ModuleList(
|
|
@@ -226,7 +227,7 @@ class POSOMMoE(nn.Module):
|
|
|
226
227
|
|
|
227
228
|
# Task-specific gates: gate_t(x) over experts
|
|
228
229
|
self.gates = nn.ModuleList(
|
|
229
|
-
[nn.Linear(input_dim, num_experts) for _ in range(
|
|
230
|
+
[nn.Linear(input_dim, num_experts) for _ in range(nums_task)]
|
|
230
231
|
)
|
|
231
232
|
self.gate_use_softmax = gate_use_softmax
|
|
232
233
|
|
|
@@ -248,7 +249,7 @@ class POSOMMoE(nn.Module):
|
|
|
248
249
|
"""
|
|
249
250
|
x: (B, input_dim)
|
|
250
251
|
pc: (B, pc_dim)
|
|
251
|
-
return: list of task outputs z_t with length
|
|
252
|
+
return: list of task outputs z_t with length nums_task, each (B, D)
|
|
252
253
|
"""
|
|
253
254
|
# 1) Expert outputs with POSO PC gate
|
|
254
255
|
masked_expert_outputs = []
|
|
@@ -262,7 +263,7 @@ class POSOMMoE(nn.Module):
|
|
|
262
263
|
|
|
263
264
|
# 2) Task gates depend on x as in standard MMoE
|
|
264
265
|
task_outputs: list[torch.Tensor] = []
|
|
265
|
-
for t in range(self.
|
|
266
|
+
for t in range(self.nums_task):
|
|
266
267
|
logits = self.gates[t](x) # (B, E)
|
|
267
268
|
if self.gate_use_softmax:
|
|
268
269
|
gate = F.softmax(logits, dim=1)
|
|
@@ -289,9 +290,9 @@ class POSO(BaseModel):
|
|
|
289
290
|
|
|
290
291
|
@property
|
|
291
292
|
def default_task(self) -> list[str]:
|
|
292
|
-
|
|
293
|
-
if
|
|
294
|
-
return ["binary"] *
|
|
293
|
+
nums_task = getattr(self, "nums_task", None)
|
|
294
|
+
if nums_task is not None and nums_task > 0:
|
|
295
|
+
return ["binary"] * nums_task
|
|
295
296
|
return ["binary"]
|
|
296
297
|
|
|
297
298
|
def __init__(
|
|
@@ -333,24 +334,24 @@ class POSO(BaseModel):
|
|
|
333
334
|
dense_l2_reg: float = 1e-4,
|
|
334
335
|
**kwargs,
|
|
335
336
|
):
|
|
336
|
-
self.
|
|
337
|
+
self.nums_task = len(target)
|
|
337
338
|
|
|
338
|
-
# Normalize task to match
|
|
339
|
+
# Normalize task to match nums_task
|
|
339
340
|
resolved_task = task
|
|
340
341
|
if resolved_task is None:
|
|
341
342
|
resolved_task = self.default_task
|
|
342
343
|
elif isinstance(resolved_task, str):
|
|
343
|
-
resolved_task = [resolved_task] * self.
|
|
344
|
-
elif len(resolved_task) == 1 and self.
|
|
345
|
-
resolved_task = resolved_task * self.
|
|
346
|
-
elif len(resolved_task) != self.
|
|
344
|
+
resolved_task = [resolved_task] * self.nums_task
|
|
345
|
+
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
346
|
+
resolved_task = resolved_task * self.nums_task
|
|
347
|
+
elif len(resolved_task) != self.nums_task:
|
|
347
348
|
raise ValueError(
|
|
348
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.
|
|
349
|
+
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
349
350
|
)
|
|
350
351
|
|
|
351
|
-
if len(tower_params_list) != self.
|
|
352
|
+
if len(tower_params_list) != self.nums_task:
|
|
352
353
|
raise ValueError(
|
|
353
|
-
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.
|
|
354
|
+
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.nums_task})"
|
|
354
355
|
)
|
|
355
356
|
|
|
356
357
|
super().__init__(
|
|
@@ -466,7 +467,7 @@ class POSO(BaseModel):
|
|
|
466
467
|
pc_dim=self.pc_input_dim,
|
|
467
468
|
num_experts=num_experts,
|
|
468
469
|
expert_hidden_dims=expert_hidden_dims,
|
|
469
|
-
|
|
470
|
+
nums_task=self.nums_task,
|
|
470
471
|
activation=expert_activation,
|
|
471
472
|
expert_dropout=expert_dropout,
|
|
472
473
|
gate_hidden_dim=expert_gate_hidden_dim,
|
|
@@ -490,7 +491,7 @@ class POSO(BaseModel):
|
|
|
490
491
|
self.grad_norm_shared_modules = ["embedding", "mmoe"]
|
|
491
492
|
self.prediction_layer = TaskHead(
|
|
492
493
|
task_type=self.default_task,
|
|
493
|
-
task_dims=[1] * self.
|
|
494
|
+
task_dims=[1] * self.nums_task,
|
|
494
495
|
)
|
|
495
496
|
include_modules = (
|
|
496
497
|
["towers", "tower_heads"]
|
|
@@ -525,4 +526,4 @@ class POSO(BaseModel):
|
|
|
525
526
|
task_outputs.append(logit)
|
|
526
527
|
|
|
527
528
|
y = torch.cat(task_outputs, dim=1)
|
|
528
|
-
return self.prediction_layer(y)
|
|
529
|
+
return self.prediction_layer(y)
|
nextrec/models/multi_task/share_bottom.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Caruana R. Multitask learning[J]. Machine Learning, 1997, 28: 41-75.
|
|
@@ -55,9 +55,9 @@ class ShareBottom(BaseModel):
|
|
|
55
55
|
|
|
56
56
|
@property
|
|
57
57
|
def default_task(self):
|
|
58
|
-
|
|
59
|
-
if
|
|
60
|
-
return ["binary"] *
|
|
58
|
+
nums_task = getattr(self, "nums_task", None)
|
|
59
|
+
if nums_task is not None and nums_task > 0:
|
|
60
|
+
return ["binary"] * nums_task
|
|
61
61
|
return ["binary"]
|
|
62
62
|
|
|
63
63
|
def __init__(
|
|
@@ -83,18 +83,18 @@ class ShareBottom(BaseModel):
|
|
|
83
83
|
|
|
84
84
|
optimizer_params = optimizer_params or {}
|
|
85
85
|
|
|
86
|
-
self.
|
|
86
|
+
self.nums_task = len(target)
|
|
87
87
|
|
|
88
88
|
resolved_task = task
|
|
89
89
|
if resolved_task is None:
|
|
90
90
|
resolved_task = self.default_task
|
|
91
91
|
elif isinstance(resolved_task, str):
|
|
92
|
-
resolved_task = [resolved_task] * self.
|
|
93
|
-
elif len(resolved_task) == 1 and self.
|
|
94
|
-
resolved_task = resolved_task * self.
|
|
95
|
-
elif len(resolved_task) != self.
|
|
92
|
+
resolved_task = [resolved_task] * self.nums_task
|
|
93
|
+
elif len(resolved_task) == 1 and self.nums_task > 1:
|
|
94
|
+
resolved_task = resolved_task * self.nums_task
|
|
95
|
+
elif len(resolved_task) != self.nums_task:
|
|
96
96
|
raise ValueError(
|
|
97
|
-
f"Length of task ({len(resolved_task)}) must match number of targets ({self.
|
|
97
|
+
f"Length of task ({len(resolved_task)}) must match number of targets ({self.nums_task})."
|
|
98
98
|
)
|
|
99
99
|
|
|
100
100
|
super(ShareBottom, self).__init__(
|
|
@@ -115,10 +115,10 @@ class ShareBottom(BaseModel):
|
|
|
115
115
|
if self.loss is None:
|
|
116
116
|
self.loss = "bce"
|
|
117
117
|
# Number of tasks
|
|
118
|
-
self.
|
|
119
|
-
if len(tower_params_list) != self.
|
|
118
|
+
self.nums_task = len(target)
|
|
119
|
+
if len(tower_params_list) != self.nums_task:
|
|
120
120
|
raise ValueError(
|
|
121
|
-
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.
|
|
121
|
+
f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.nums_task})"
|
|
122
122
|
)
|
|
123
123
|
# Embedding layer
|
|
124
124
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
@@ -144,7 +144,7 @@ class ShareBottom(BaseModel):
|
|
|
144
144
|
tower = MLP(input_dim=bottom_output_dim, output_layer=True, **tower_params)
|
|
145
145
|
self.towers.append(tower)
|
|
146
146
|
self.prediction_layer = TaskHead(
|
|
147
|
-
task_type=self.default_task, task_dims=[1] * self.
|
|
147
|
+
task_type=self.default_task, task_dims=[1] * self.nums_task
|
|
148
148
|
)
|
|
149
149
|
# Register regularization weights
|
|
150
150
|
self.register_regularization_weights(
|
|
@@ -170,6 +170,6 @@ class ShareBottom(BaseModel):
|
|
|
170
170
|
tower_output = tower(bottom_output) # [B, 1]
|
|
171
171
|
task_outputs.append(tower_output)
|
|
172
172
|
|
|
173
|
-
# Stack outputs: [B,
|
|
173
|
+
# Stack outputs: [B, nums_task]
|
|
174
174
|
y = torch.cat(task_outputs, dim=1)
|
|
175
|
-
return self.prediction_layer(y)
|
|
175
|
+
return self.prediction_layer(y)
|
nextrec/models/ranking/afm.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
|
|
@@ -244,4 +244,4 @@ class AFM(BaseModel):
|
|
|
244
244
|
y_afm = self.output_projection(weighted_sum)
|
|
245
245
|
|
|
246
246
|
y = y_linear + y_afm
|
|
247
|
-
return self.prediction_layer(y)
|
|
247
|
+
return self.prediction_layer(y)
|
nextrec/models/ranking/autoint.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
|
|
@@ -207,4 +207,4 @@ class AutoInt(BaseModel):
|
|
|
207
207
|
start_dim=1
|
|
208
208
|
) # [B, num_fields * att_embedding_dim]
|
|
209
209
|
y = self.fc(attention_output_flat) # [B, 1]
|
|
210
|
-
return self.prediction_layer(y)
|
|
210
|
+
return self.prediction_layer(y)
|
nextrec/models/ranking/dcn.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]
|
|
@@ -198,4 +198,4 @@ class DCN(BaseModel):
|
|
|
198
198
|
|
|
199
199
|
# Final prediction
|
|
200
200
|
y = self.final_layer(combined)
|
|
201
|
-
return self.prediction_layer(y)
|
|
201
|
+
return self.prediction_layer(y)
|
nextrec/models/ranking/dcn_v2.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Checkpoint: edit on
|
|
3
|
+
Checkpoint: edit on 23/12/2025
|
|
4
4
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
6
|
[1] R. Wang et al. DCN V2: Improved Deep & Cross Network and Practical Lessons for
|
|
@@ -302,4 +302,4 @@ class DCNv2(BaseModel):
|
|
|
302
302
|
combined = cross_out
|
|
303
303
|
|
|
304
304
|
logit = self.final_layer(combined)
|
|
305
|
-
return self.prediction_layer(logit)
|
|
305
|
+
return self.prediction_layer(logit)
|