nextrec 0.1.4__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +4 -4
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +9 -10
- nextrec/basic/callback.py +0 -1
- nextrec/basic/dataloader.py +127 -168
- nextrec/basic/features.py +27 -24
- nextrec/basic/layers.py +159 -328
- nextrec/basic/loggers.py +37 -50
- nextrec/basic/metrics.py +147 -255
- nextrec/basic/model.py +462 -817
- nextrec/data/__init__.py +5 -5
- nextrec/data/data_utils.py +12 -16
- nextrec/data/preprocessor.py +252 -276
- nextrec/loss/__init__.py +12 -12
- nextrec/loss/loss_utils.py +22 -30
- nextrec/loss/match_losses.py +83 -116
- nextrec/models/match/__init__.py +5 -5
- nextrec/models/match/dssm.py +61 -70
- nextrec/models/match/dssm_v2.py +51 -61
- nextrec/models/match/mind.py +71 -89
- nextrec/models/match/sdm.py +81 -93
- nextrec/models/match/youtube_dnn.py +53 -62
- nextrec/models/multi_task/esmm.py +43 -49
- nextrec/models/multi_task/mmoe.py +56 -65
- nextrec/models/multi_task/ple.py +65 -92
- nextrec/models/multi_task/share_bottom.py +42 -48
- nextrec/models/ranking/__init__.py +7 -7
- nextrec/models/ranking/afm.py +30 -39
- nextrec/models/ranking/autoint.py +57 -70
- nextrec/models/ranking/dcn.py +35 -43
- nextrec/models/ranking/deepfm.py +28 -34
- nextrec/models/ranking/dien.py +79 -115
- nextrec/models/ranking/din.py +60 -84
- nextrec/models/ranking/fibinet.py +35 -51
- nextrec/models/ranking/fm.py +26 -28
- nextrec/models/ranking/masknet.py +31 -31
- nextrec/models/ranking/pnn.py +31 -30
- nextrec/models/ranking/widedeep.py +31 -36
- nextrec/models/ranking/xdeepfm.py +39 -46
- nextrec/utils/__init__.py +9 -9
- nextrec/utils/embedding.py +1 -1
- nextrec/utils/initializer.py +15 -23
- nextrec/utils/optimizer.py +10 -14
- {nextrec-0.1.4.dist-info → nextrec-0.1.8.dist-info}/METADATA +16 -7
- nextrec-0.1.8.dist-info/RECORD +51 -0
- nextrec-0.1.4.dist-info/RECORD +0 -51
- {nextrec-0.1.4.dist-info → nextrec-0.1.8.dist-info}/WHEEL +0 -0
- {nextrec-0.1.4.dist-info → nextrec-0.1.8.dist-info}/licenses/LICENSE +0 -0
nextrec/models/multi_task/ple.py
CHANGED
@@ -17,13 +17,13 @@ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 class PLE(BaseModel):
     """
     Progressive Layered Extraction
-
+
     PLE is an advanced multi-task learning model that extends MMOE by introducing
     both task-specific experts and shared experts at each level. It uses a progressive
     routing mechanism where experts from level k feed into gates at level k+1.
     This design better captures task-specific and shared information progressively.
     """
-
+
     @property
     def model_name(self):
         return "PLE"
@@ -31,31 +31,29 @@ class PLE(BaseModel):
     @property
     def task_type(self):
         return self.task if isinstance(self.task, list) else [self.task]
-
-    def __init__(
-        self,
-        dense_features: list[DenseFeature],
-        sparse_features: list[SparseFeature],
-        sequence_features: list[SequenceFeature],
-        shared_expert_params: dict,
-        specific_expert_params: dict,
-        num_shared_experts: int,
-        num_specific_experts: int,
-        num_levels: int,
-        tower_params_list: list[dict],
-        target: list[str],
-        task: str | list[str] = "binary",
-        optimizer: str = "adam",
-        optimizer_params: dict = {},
-        loss: str | nn.Module | list[str | nn.Module] | None = "bce",
-        device: str = "cpu",
-        model_id: str = "baseline",
-        embedding_l1_reg=1e-6,
-        dense_l1_reg=1e-5,
-        embedding_l2_reg=1e-5,
-        dense_l2_reg=1e-4,
-    ):
-
+
+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 shared_expert_params: dict,
+                 specific_expert_params: dict,
+                 num_shared_experts: int,
+                 num_specific_experts: int,
+                 num_levels: int,
+                 tower_params_list: list[dict],
+                 target: list[str],
+                 task: str | list[str] = 'binary',
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
         super(PLE, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -68,13 +66,13 @@ class PLE(BaseModel):
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
             early_stop_patience=20,
-            model_id=model_id
+            model_id=model_id
         )
 
         self.loss = loss
         if self.loss is None:
             self.loss = "bce"
-
+
         # Number of tasks, experts, and levels
         self.num_tasks = len(target)
         self.num_shared_experts = num_shared_experts
@@ -82,12 +80,10 @@ class PLE(BaseModel):
         self.num_levels = num_levels
         if optimizer_params is None:
             optimizer_params = {}
-
+
         if len(tower_params_list) != self.num_tasks:
-            raise ValueError(
-                f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})"
-            )
-
+            raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
         # All features
         self.all_features = dense_features + sparse_features + sequence_features
 
@@ -95,60 +91,42 @@ class PLE(BaseModel):
         self.embedding = EmbeddingLayer(features=self.all_features)
 
         # Calculate input dimension
-        emb_dim_total = sum(
-            [
-                f.embedding_dim
-                for f in self.all_features
-                if not isinstance(f, DenseFeature)
-            ]
-        )
-        dense_input_dim = sum(
-            [getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
-        )
+        emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+        dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
         input_dim = emb_dim_total + dense_input_dim
-
+
         # Get expert output dimension
-        if "dims" in shared_expert_params and len(shared_expert_params["dims"]) > 0:
-            expert_output_dim = shared_expert_params["dims"][-1]
+        if 'dims' in shared_expert_params and len(shared_expert_params['dims']) > 0:
+            expert_output_dim = shared_expert_params['dims'][-1]
         else:
             expert_output_dim = input_dim
-
+
         # Build extraction layers (CGC layers)
         self.shared_experts_layers = nn.ModuleList()  # [num_levels]
         self.specific_experts_layers = nn.ModuleList()  # [num_levels, num_tasks]
-        self.gates_layers = (
-            nn.ModuleList()
-        )  # [num_levels, num_tasks + 1] (+1 for shared gate)
-
+        self.gates_layers = nn.ModuleList()  # [num_levels, num_tasks + 1] (+1 for shared gate)
+
         for level in range(num_levels):
             # Input dimension for this level
             level_input_dim = input_dim if level == 0 else expert_output_dim
-
+
             # Shared experts for this level
             shared_experts = nn.ModuleList()
             for _ in range(num_shared_experts):
-                expert = MLP(
-                    input_dim=level_input_dim,
-                    output_layer=False,
-                    **shared_expert_params,
-                )
+                expert = MLP(input_dim=level_input_dim, output_layer=False, **shared_expert_params)
                 shared_experts.append(expert)
             self.shared_experts_layers.append(shared_experts)
-
+
             # Task-specific experts for this level
             specific_experts_for_tasks = nn.ModuleList()
             for _ in range(self.num_tasks):
                 task_experts = nn.ModuleList()
                 for _ in range(num_specific_experts):
-                    expert = MLP(
-                        input_dim=level_input_dim,
-                        output_layer=False,
-                        **specific_expert_params,
-                    )
+                    expert = MLP(input_dim=level_input_dim, output_layer=False, **specific_expert_params)
                     task_experts.append(expert)
                 specific_experts_for_tasks.append(task_experts)
             self.specific_experts_layers.append(specific_experts_for_tasks)
-
+
             # Gates for this level (num_tasks task gates + 1 shared gate)
             gates = nn.ModuleList()
             # Task-specific gates
@@ -156,42 +134,40 @@ class PLE(BaseModel):
             for _ in range(self.num_tasks):
                 gate = nn.Sequential(
                     nn.Linear(level_input_dim, num_experts_for_task_gate),
-                    nn.Softmax(dim=1)
+                    nn.Softmax(dim=1)
                 )
                 gates.append(gate)
             # Shared gate: contains all tasks' specific experts + shared experts
             # expert counts = num_shared_experts + num_specific_experts * num_tasks
-            num_experts_for_shared_gate = (
-                num_shared_experts + num_specific_experts * self.num_tasks
-            )
+            num_experts_for_shared_gate = num_shared_experts + num_specific_experts * self.num_tasks
             shared_gate = nn.Sequential(
                 nn.Linear(level_input_dim, num_experts_for_shared_gate),
-                nn.Softmax(dim=1)
+                nn.Softmax(dim=1)
             )
             gates.append(shared_gate)
             self.gates_layers.append(gates)
-
+
         # Task-specific towers
         self.towers = nn.ModuleList()
         for tower_params in tower_params_list:
             tower = MLP(input_dim=expert_output_dim, output_layer=True, **tower_params)
             self.towers.append(tower)
         self.prediction_layer = PredictionLayer(
-            task_type=self.task_type,
+            task_type=self.task_type,
+            task_dims=[1] * self.num_tasks
         )
 
         # Register regularization weights
         self._register_regularization_weights(
-            embedding_attr="embedding",
-            include_modules=[
-                "shared_experts_layers",
-                "specific_experts_layers",
-                "gates_layers",
-                "towers",
-            ],
+            embedding_attr='embedding',
+            include_modules=['shared_experts_layers', 'specific_experts_layers', 'gates_layers', 'towers']
         )
 
-        self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss)
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
 
     def forward(self, x):
         # Get all embeddings and flatten
@@ -203,17 +179,13 @@ class PLE(BaseModel):
 
         # Progressive Layered Extraction: CGC
         for level in range(self.num_levels):
-            shared_experts = self.shared_experts_layers[
-                level
-            ]  # ModuleList[num_shared_experts]
-            specific_experts = self.specific_experts_layers[
-                level
-            ]  # ModuleList[num_tasks][num_specific_experts]
-            gates = self.gates_layers[level]  # ModuleList[num_tasks + 1]
+            shared_experts = self.shared_experts_layers[level]  # ModuleList[num_shared_experts]
+            specific_experts = self.specific_experts_layers[level]  # ModuleList[num_tasks][num_specific_experts]
+            gates = self.gates_layers[level]  # ModuleList[num_tasks + 1]
 
             # Compute shared experts output for this level
             # shared_expert_list: List[Tensor[B, expert_dim]]
-            shared_expert_list = [expert(shared_fea) for expert in shared_experts]
+            shared_expert_list = [expert(shared_fea) for expert in shared_experts]  # type: ignore[list-item]
             # [num_shared_experts, B, expert_dim]
             shared_expert_outputs = torch.stack(shared_expert_list, dim=0)
 
@@ -226,7 +198,7 @@ class PLE(BaseModel):
                 current_task_in = task_fea[task_idx]
 
                 # Specific task experts for this task
-                task_expert_modules = specific_experts[task_idx]
+                task_expert_modules = specific_experts[task_idx]  # type: ignore
 
                 # Specific task expert output list List[Tensor[B, expert_dim]]
                 task_specific_list = []
@@ -242,7 +214,8 @@ class PLE(BaseModel):
                 # Input for gate: shared_experts + own specific task experts
                 # [num_shared + num_specific, B, expert_dim]
                 all_expert_outputs = torch.cat(
-                    [shared_expert_outputs, task_specific_outputs], dim=0
+                    [shared_expert_outputs, task_specific_outputs],
+                    dim=0
                 )
                 # [B, num_experts, expert_dim]
                 all_expert_outputs_t = all_expert_outputs.permute(1, 0, 2)
@@ -266,7 +239,7 @@ class PLE(BaseModel):
             all_for_shared = torch.stack(all_for_shared_list, dim=1)
 
             # [B, num_all_experts]
-            shared_gate_weights = gates[self.num_tasks](shared_fea)
+            shared_gate_weights = gates[self.num_tasks](shared_fea)  # type: ignore
             # [B, 1, num_all_experts]
             shared_gate_weights = shared_gate_weights.unsqueeze(1)
 
@@ -284,4 +257,4 @@ class PLE(BaseModel):
 
         # [B, num_tasks]
         y = torch.cat(task_outputs, dim=1)
-        return self.prediction_layer(y)
+        return self.prediction_layer(y)
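The CGC gating step in the PLE forward pass above reduces to a batched weighted sum over stacked expert outputs. Below is a shape-level sketch of one such step in plain PyTorch: random tensors stand in for the MLP experts and the nn.Sequential gates, and none of the names are nextrec APIs.

import torch

# One CGC gating step, shapes as in the diff comments:
# shared experts -> [num_shared, B, expert_dim], task experts -> [num_specific, B, expert_dim]
B, expert_dim = 4, 8
num_shared, num_specific = 2, 3
shared_expert_outputs = torch.randn(num_shared, B, expert_dim)
task_specific_outputs = torch.randn(num_specific, B, expert_dim)

# Gate input: shared experts + this task's own specific experts
all_expert_outputs = torch.cat([shared_expert_outputs, task_specific_outputs], dim=0)
all_expert_outputs_t = all_expert_outputs.permute(1, 0, 2)  # [B, num_experts, expert_dim]

# A gate is Linear + Softmax over experts; random weights stand in here
gate_weights = torch.softmax(torch.randn(B, num_shared + num_specific), dim=1)
gate_weights = gate_weights.unsqueeze(1)  # [B, 1, num_experts]

# Weighted sum over experts -> this task's input to the next level
next_task_in = torch.bmm(gate_weights, all_expert_outputs_t).squeeze(1)  # [B, expert_dim]
print(next_task_in.shape)  # torch.Size([4, 8])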
nextrec/models/multi_task/share_bottom.py
CHANGED
@@ -23,27 +23,25 @@ class ShareBottom(BaseModel):
     def task_type(self):
         # Multi-task model, return list of task types
         return self.task if isinstance(self.task, list) else [self.task]
-
-    def __init__(
-        self,
-        dense_features: list[DenseFeature],
-        sparse_features: list[SparseFeature],
-        sequence_features: list[SequenceFeature],
-        bottom_params: dict,
-        tower_params_list: list[dict],
-        target: list[str],
-        task: str | list[str] = "binary",
-        optimizer: str = "adam",
-        optimizer_params: dict = {},
-        loss: str | nn.Module | list[str | nn.Module] | None = "bce",
-        device: str = "cpu",
-        model_id: str = "baseline",
-        embedding_l1_reg=1e-6,
-        dense_l1_reg=1e-5,
-        embedding_l2_reg=1e-5,
-        dense_l2_reg=1e-4,
-    ):
-
+
+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 bottom_params: dict,
+                 tower_params_list: list[dict],
+                 target: list[str],
+                 task: str | list[str] = 'binary',
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | list[str | nn.Module] | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
         super(ShareBottom, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -56,20 +54,18 @@ class ShareBottom(BaseModel):
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
             early_stop_patience=20,
-            model_id=model_id
+            model_id=model_id
         )
 
         self.loss = loss
         if self.loss is None:
             self.loss = "bce"
-
+
         # Number of tasks
         self.num_tasks = len(target)
         if len(tower_params_list) != self.num_tasks:
-            raise ValueError(
-                f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})"
-            )
-
+            raise ValueError(f"Number of tower params ({len(tower_params_list)}) must match number of tasks ({self.num_tasks})")
+
         # All features
         self.all_features = dense_features + sparse_features + sequence_features
 
@@ -77,56 +73,54 @@ class ShareBottom(BaseModel):
         self.embedding = EmbeddingLayer(features=self.all_features)
 
         # Calculate input dimension
-        emb_dim_total = sum(
-            [
-                f.embedding_dim
-                for f in self.all_features
-                if not isinstance(f, DenseFeature)
-            ]
-        )
-        dense_input_dim = sum(
-            [getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
-        )
+        emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
+        dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
         input_dim = emb_dim_total + dense_input_dim
-
+
         # Shared bottom network
         self.bottom = MLP(input_dim=input_dim, output_layer=False, **bottom_params)
-
+
         # Get bottom output dimension
-        if "dims" in bottom_params and len(bottom_params["dims"]) > 0:
-            bottom_output_dim = bottom_params["dims"][-1]
+        if 'dims' in bottom_params and len(bottom_params['dims']) > 0:
+            bottom_output_dim = bottom_params['dims'][-1]
         else:
             bottom_output_dim = input_dim
-
+
         # Task-specific towers
         self.towers = nn.ModuleList()
         for tower_params in tower_params_list:
             tower = MLP(input_dim=bottom_output_dim, output_layer=True, **tower_params)
             self.towers.append(tower)
         self.prediction_layer = PredictionLayer(
-            task_type=self.task_type,
+            task_type=self.task_type,
+            task_dims=[1] * self.num_tasks
         )
 
         # Register regularization weights
         self._register_regularization_weights(
-            embedding_attr="embedding", include_modules=["bottom", "towers"]
+            embedding_attr='embedding',
+            include_modules=['bottom', 'towers']
         )
 
-        self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss)
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
 
     def forward(self, x):
         # Get all embeddings and flatten
         input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
-
+
         # Shared bottom
         bottom_output = self.bottom(input_flat)  # [B, bottom_dim]
-
+
         # Task-specific towers
         task_outputs = []
         for tower in self.towers:
             tower_output = tower(bottom_output)  # [B, 1]
             task_outputs.append(tower_output)
-
+
         # Stack outputs: [B, num_tasks]
         y = torch.cat(task_outputs, dim=1)
         return self.prediction_layer(y)
nextrec/models/ranking/afm.py
CHANGED
@@ -23,26 +23,24 @@ class AFM(BaseModel):
     @property
     def task_type(self):
         return "binary"
-
-    def __init__(
-        self,
-        dense_features: list[DenseFeature] | list = [],
-        sparse_features: list[SparseFeature] | list = [],
-        sequence_features: list[SequenceFeature] | list = [],
-        attention_dim: int = 32,
-        attention_dropout: float = 0.0,
-        target: list[str] | list = [],
-        optimizer: str = "adam",
-        optimizer_params: dict = {},
-        loss: str | nn.Module | None = "bce",
-        device: str = "cpu",
-        model_id: str = "baseline",
-        embedding_l1_reg=1e-6,
-        dense_l1_reg=1e-5,
-        embedding_l2_reg=1e-5,
-        dense_l2_reg=1e-4,
-    ):
-
+
+    def __init__(self,
+                 dense_features: list[DenseFeature] | list = [],
+                 sparse_features: list[SparseFeature] | list = [],
+                 sequence_features: list[SequenceFeature] | list = [],
+                 attention_dim: int = 32,
+                 attention_dropout: float = 0.0,
+                 target: list[str] | list = [],
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
         super(AFM, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -55,25 +53,21 @@ class AFM(BaseModel):
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
             early_stop_patience=20,
-            model_id=model_id
+            model_id=model_id
         )
 
         self.loss = loss
         if self.loss is None:
             self.loss = "bce"
-
+
         self.fm_features = sparse_features + sequence_features
         if len(self.fm_features) < 2:
-            raise ValueError(
-                "AFM requires at least two sparse/sequence features to build pairwise interactions."
-            )
+            raise ValueError("AFM requires at least two sparse/sequence features to build pairwise interactions.")
 
         # Assume uniform embedding dimension across FM fields
         self.embedding_dim = self.fm_features[0].embedding_dim
         if any(f.embedding_dim != self.embedding_dim for f in self.fm_features):
-            raise ValueError(
-                "All FM features must share the same embedding_dim for AFM."
-            )
+            raise ValueError("All FM features must share the same embedding_dim for AFM.")
 
         self.embedding = EmbeddingLayer(features=self.fm_features)
 
@@ -88,21 +82,18 @@ class AFM(BaseModel):
 
         # Register regularization weights
         self._register_regularization_weights(
-            embedding_attr="embedding",
-            include_modules=[
-                "linear",
-                "attention_linear",
-                "attention_p",
-                "output_projection",
-            ],
+            embedding_attr='embedding',
+            include_modules=['linear', 'attention_linear', 'attention_p', 'output_projection']
         )
 
-        self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss)
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )
 
     def forward(self, x):
-        field_emb = self.embedding(
-            x=x, features=self.fm_features, squeeze_dim=False
-        )  # [B, F, D]
+        field_emb = self.embedding(x=x, features=self.fm_features, squeeze_dim=False)  # [B, F, D]
         input_linear = field_emb.flatten(start_dim=1)
         y_linear = self.linear(input_linear)
 
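For context, the pairwise interactions that the ValueError above guards against are the elementwise products of every pair of field embeddings, which AFM then scores with attention. A shape-level sketch of the pairwise term, with a random stand-in for field_emb; this shows the standard AFM construction, not nextrec's exact attention code, which falls outside this hunk:

import torch

B, F, D = 4, 3, 8                    # batch, fields, embedding_dim
field_emb = torch.randn(B, F, D)     # stand-in for self.embedding(...) output

# All F*(F-1)/2 unordered field pairs
rows, cols = torch.triu_indices(F, F, offset=1)
pairwise = field_emb[:, rows] * field_emb[:, cols]  # [B, F*(F-1)/2, D]
print(pairwise.shape)  # torch.Size([4, 3, 8])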