nextrec-0.1.4-py3-none-any.whl → nextrec-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +4 -4
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +9 -10
- nextrec/basic/callback.py +0 -1
- nextrec/basic/dataloader.py +127 -168
- nextrec/basic/features.py +27 -24
- nextrec/basic/layers.py +159 -328
- nextrec/basic/loggers.py +37 -50
- nextrec/basic/metrics.py +147 -255
- nextrec/basic/model.py +462 -817
- nextrec/data/__init__.py +5 -5
- nextrec/data/data_utils.py +12 -16
- nextrec/data/preprocessor.py +252 -276
- nextrec/loss/__init__.py +12 -12
- nextrec/loss/loss_utils.py +22 -30
- nextrec/loss/match_losses.py +83 -116
- nextrec/models/match/__init__.py +5 -5
- nextrec/models/match/dssm.py +61 -70
- nextrec/models/match/dssm_v2.py +51 -61
- nextrec/models/match/mind.py +71 -89
- nextrec/models/match/sdm.py +81 -93
- nextrec/models/match/youtube_dnn.py +53 -62
- nextrec/models/multi_task/esmm.py +43 -49
- nextrec/models/multi_task/mmoe.py +56 -65
- nextrec/models/multi_task/ple.py +65 -92
- nextrec/models/multi_task/share_bottom.py +42 -48
- nextrec/models/ranking/__init__.py +7 -7
- nextrec/models/ranking/afm.py +30 -39
- nextrec/models/ranking/autoint.py +57 -70
- nextrec/models/ranking/dcn.py +35 -43
- nextrec/models/ranking/deepfm.py +28 -34
- nextrec/models/ranking/dien.py +79 -115
- nextrec/models/ranking/din.py +60 -84
- nextrec/models/ranking/fibinet.py +35 -51
- nextrec/models/ranking/fm.py +26 -28
- nextrec/models/ranking/masknet.py +31 -31
- nextrec/models/ranking/pnn.py +31 -30
- nextrec/models/ranking/widedeep.py +31 -36
- nextrec/models/ranking/xdeepfm.py +39 -46
- nextrec/utils/__init__.py +9 -9
- nextrec/utils/embedding.py +1 -1
- nextrec/utils/initializer.py +15 -23
- nextrec/utils/optimizer.py +10 -14
- {nextrec-0.1.4.dist-info → nextrec-0.1.7.dist-info}/METADATA +16 -7
- nextrec-0.1.7.dist-info/RECORD +51 -0
- nextrec-0.1.4.dist-info/RECORD +0 -51
- {nextrec-0.1.4.dist-info → nextrec-0.1.7.dist-info}/WHEEL +0 -0
- {nextrec-0.1.4.dist-info → nextrec-0.1.7.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/dien.py
CHANGED

@@ -12,14 +12,7 @@ import torch
import torch.nn as nn

from nextrec.basic.model import BaseModel
-from nextrec.basic.layers import (
-    EmbeddingLayer,
-    MLP,
-    AttentionPoolingLayer,
-    DynamicGRU,
-    AUGRU,
-    PredictionLayer,
-)
+from nextrec.basic.layers import EmbeddingLayer, MLP, AttentionPoolingLayer, DynamicGRU, AUGRU, PredictionLayer
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature


@@ -31,29 +24,27 @@ class DIEN(BaseModel):
    @property
    def task_type(self):
        return "binary"

+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 mlp_params: dict,
+                 gru_hidden_size: int = 64,
+                 attention_hidden_units: list[int] = [80, 40],
+                 attention_activation: str = 'sigmoid',
+                 use_negsampling: bool = False,
+                 target: list[str] = [],
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
        super(DIEN, self).__init__(
            dense_features=dense_features,
            sparse_features=sparse_features,

@@ -66,185 +57,158 @@ class DIEN(BaseModel):
            embedding_l2_reg=embedding_l2_reg,
            dense_l2_reg=dense_l2_reg,
            early_stop_patience=20,
            model_id=model_id
        )

        self.loss = loss
        if self.loss is None:
            self.loss = "bce"

        self.use_negsampling = use_negsampling

        # Features classification
        if len(sequence_features) == 0:
+            raise ValueError("DIEN requires at least one sequence feature for user behavior history")

        self.behavior_feature = sequence_features[0]  # User behavior sequence
-        self.other_sparse_features = (
-            sparse_features[:-1] if self.candidate_feature else sparse_features
-        )
+        self.candidate_feature = sparse_features[-1] if sparse_features else None  # Candidate item
+
+        self.other_sparse_features = sparse_features[:-1] if self.candidate_feature else sparse_features
        self.dense_features_list = dense_features

        # All features for embedding
        self.all_features = dense_features + sparse_features + sequence_features

        # Embedding layer
        self.embedding = EmbeddingLayer(features=self.all_features)

        behavior_emb_dim = self.behavior_feature.embedding_dim
        self.candidate_proj = None
+        if self.candidate_feature is not None and self.candidate_feature.embedding_dim != gru_hidden_size:
+            self.candidate_proj = nn.Linear(self.candidate_feature.embedding_dim, gru_hidden_size)
+
        # Interest Extractor Layer (GRU)
        self.interest_extractor = DynamicGRU(
            input_size=behavior_emb_dim,
+            hidden_size=gru_hidden_size
        )

        # Attention layer for computing attention scores
        self.attention_layer = AttentionPoolingLayer(
            embedding_dim=gru_hidden_size,
            hidden_units=attention_hidden_units,
            activation=attention_activation,
-            use_softmax=False
+            use_softmax=False  # We'll use scores directly for AUGRU
        )

        # Interest Evolution Layer (AUGRU)
        self.interest_evolution = AUGRU(
            input_size=gru_hidden_size,
+            hidden_size=gru_hidden_size
        )

        # Calculate MLP input dimension
        mlp_input_dim = 0
        if self.candidate_feature:
            mlp_input_dim += self.candidate_feature.embedding_dim
        mlp_input_dim += gru_hidden_size  # final interest state
        mlp_input_dim += sum([f.embedding_dim for f in self.other_sparse_features])
+        mlp_input_dim += sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])

        # MLP for final prediction
        self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
        self.prediction_layer = PredictionLayer(task_type=self.task_type)

        # Register regularization weights
        self._register_regularization_weights(
-            include_modules=[
-                "interest_extractor",
-                "interest_evolution",
-                "attention_layer",
-                "mlp",
-                "candidate_proj",
-            ],
+            embedding_attr='embedding',
+            include_modules=['interest_extractor', 'interest_evolution', 'attention_layer', 'mlp', 'candidate_proj']
        )

+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )

    def forward(self, x):
        # Get candidate item embedding
        if self.candidate_feature:
-            candidate_emb = self.embedding.embed_dict[
-                self.candidate_feature.embedding_name
-            ](
+            candidate_emb = self.embedding.embed_dict[self.candidate_feature.embedding_name](
                x[self.candidate_feature.name].long()
            )  # [B, emb_dim]
        else:
            raise ValueError("DIEN requires a candidate item feature")

        # Get behavior sequence embedding
        behavior_seq = x[self.behavior_feature.name].long()  # [B, seq_len]
        behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](
            behavior_seq
        )  # [B, seq_len, emb_dim]

        # Create mask for padding
        if self.behavior_feature.padding_idx is not None:
-            mask = (
-                (behavior_seq != self.behavior_feature.padding_idx)
-                .unsqueeze(-1)
-                .float()
-            )
+            mask = (behavior_seq != self.behavior_feature.padding_idx).unsqueeze(-1).float()
        else:
            mask = (behavior_seq != 0).unsqueeze(-1).float()

        # Step 1: Interest Extractor (GRU)
+        interest_states, _ = self.interest_extractor(behavior_emb)  # [B, seq_len, hidden_size]

        # Step 2: Compute attention scores for each time step
        batch_size, seq_len, hidden_size = interest_states.shape

        # Project candidate to hidden_size if necessary (defined in __init__)
        if self.candidate_proj is not None:
            candidate_for_attention = self.candidate_proj(candidate_emb)
        else:
            candidate_for_attention = candidate_emb

        # Compute attention scores for AUGRU
        attention_scores = []
        for t in range(seq_len):
            score = self.attention_layer.attention_net(
+                torch.cat([
+                    candidate_for_attention,
+                    interest_states[:, t, :],
+                    candidate_for_attention - interest_states[:, t, :],
+                    candidate_for_attention * interest_states[:, t, :]
+                ], dim=-1)
            )  # [B, 1]
            attention_scores.append(score)

-        attention_scores = torch.cat(attention_scores, dim=1).unsqueeze(
-            -1
-        )  # [B, seq_len, 1]
+        attention_scores = torch.cat(attention_scores, dim=1).unsqueeze(-1)  # [B, seq_len, 1]
        attention_scores = torch.sigmoid(attention_scores)  # Normalize to [0, 1]

        # Apply mask to attention scores
        attention_scores = attention_scores * mask

        # Step 3: Interest Evolution (AUGRU)
        final_states, final_interest = self.interest_evolution(
            interest_states,
+            attention_scores
        )  # final_interest: [B, hidden_size]

        # Get other features
        other_embeddings = []
        other_embeddings.append(candidate_emb)
        other_embeddings.append(final_interest)

        # Other sparse features
        for feat in self.other_sparse_features:
-            feat_emb = self.embedding.embed_dict[feat.embedding_name](
-                x[feat.name].long()
-            )
+            feat_emb = self.embedding.embed_dict[feat.embedding_name](x[feat.name].long())
            other_embeddings.append(feat_emb)

        # Dense features
        for feat in self.dense_features_list:
            val = x[feat.name].float()
            if val.dim() == 1:
                val = val.unsqueeze(1)
            other_embeddings.append(val)

        # Concatenate all features
        concat_input = torch.cat(other_embeddings, dim=-1)  # [B, total_dim]

        # MLP prediction
        y = self.mlp(concat_input)  # [B, 1]
        return self.prediction_layer(y)
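
To make the interest-evolution step above concrete, here is a minimal, self-contained PyTorch sketch of the AUGRU idea that DIEN's interest_evolution layer is built around: a GRU-style cell whose update gate is scaled by the per-step attention score, so behavior steps the attention layer deems relevant to the candidate move the hidden state more. The class and variable names below are illustrative only; nextrec's own AUGRU and DynamicGRU layers live in nextrec.basic.layers and may differ in signature.

# Minimal AUGRU sketch (illustrative; not nextrec's implementation).
import torch
import torch.nn as nn

class AUGRUCellSketch(nn.Module):
    """GRU cell whose update gate is scaled by an attention score a_t in [0, 1]."""
    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.x2h = nn.Linear(input_size, 3 * hidden_size)
        self.h2h = nn.Linear(hidden_size, 3 * hidden_size)

    def forward(self, x, h, att):                      # x: [B, I], h: [B, H], att: [B, 1]
        xr, xu, xn = self.x2h(x).chunk(3, dim=-1)
        hr, hu, hn = self.h2h(h).chunk(3, dim=-1)
        r = torch.sigmoid(xr + hr)                     # reset gate
        u = att * torch.sigmoid(xu + hu)               # update gate, scaled by attention
        n = torch.tanh(xn + r * hn)                    # candidate state
        return (1.0 - u) * h + u * n

# Rolling the cell over interest_states [B, T, H] with attention_scores [B, T, 1],
# which is what the forward pass above does conceptually:
B, T, H = 4, 6, 64
cell = AUGRUCellSketch(H, H)
interest_states = torch.randn(B, T, H)
attention_scores = torch.sigmoid(torch.randn(B, T, 1))
h = torch.zeros(B, H)
for t in range(T):
    h = cell(interest_states[:, t, :], h, attention_scores[:, t, :])
print(h.shape)  # torch.Size([4, 64]) -- the "final interest" concatenated into the MLP input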
nextrec/models/ranking/din.py
CHANGED

@@ -12,12 +12,7 @@ import torch
import torch.nn as nn

from nextrec.basic.model import BaseModel
-from nextrec.basic.layers import (
-    EmbeddingLayer,
-    MLP,
-    AttentionPoolingLayer,
-    PredictionLayer,
-)
+from nextrec.basic.layers import EmbeddingLayer, MLP, AttentionPoolingLayer, PredictionLayer
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature


@@ -29,28 +24,26 @@ class DIN(BaseModel):
    @property
    def task_type(self):
        return "binary"

+    def __init__(self,
+                 dense_features: list[DenseFeature],
+                 sparse_features: list[SparseFeature],
+                 sequence_features: list[SequenceFeature],
+                 mlp_params: dict,
+                 attention_hidden_units: list[int] = [80, 40],
+                 attention_activation: str = 'sigmoid',
+                 attention_use_softmax: bool = True,
+                 target: list[str] = [],
+                 optimizer: str = "adam",
+                 optimizer_params: dict = {},
+                 loss: str | nn.Module | None = "bce",
+                 device: str = 'cpu',
+                 model_id: str = "baseline",
+                 embedding_l1_reg=1e-6,
+                 dense_l1_reg=1e-5,
+                 embedding_l2_reg=1e-5,
+                 dense_l2_reg=1e-4):
+
        super(DIN, self).__init__(
            dense_features=dense_features,
            sparse_features=sparse_features,

@@ -63,54 +56,43 @@ class DIN(BaseModel):
            embedding_l2_reg=embedding_l2_reg,
            dense_l2_reg=dense_l2_reg,
            early_stop_patience=20,
            model_id=model_id
        )

        self.loss = loss
        if self.loss is None:
            self.loss = "bce"

        # Features classification
        # DIN requires: candidate item + user behavior sequence + other features
        if len(sequence_features) == 0:
+            raise ValueError("DIN requires at least one sequence feature for user behavior history")

        self.behavior_feature = sequence_features[0]  # User behavior sequence
+        self.candidate_feature = sparse_features[-1] if sparse_features else None  # Candidate item
+
        # Other features (excluding behavior sequence in final concatenation)
+        self.other_sparse_features = sparse_features[:-1] if self.candidate_feature else sparse_features
        self.dense_features_list = dense_features

        # All features for embedding
        self.all_features = dense_features + sparse_features + sequence_features

        # Embedding layer
        self.embedding = EmbeddingLayer(features=self.all_features)

        # Attention layer for behavior sequence
        behavior_emb_dim = self.behavior_feature.embedding_dim
        self.candidate_attention_proj = None
+        if self.candidate_feature is not None and self.candidate_feature.embedding_dim != behavior_emb_dim:
+            self.candidate_attention_proj = nn.Linear(self.candidate_feature.embedding_dim, behavior_emb_dim)
        self.attention = AttentionPoolingLayer(
            embedding_dim=behavior_emb_dim,
            hidden_units=attention_hidden_units,
            activation=attention_activation,
            use_softmax=attention_use_softmax
        )

        # Calculate MLP input dimension
        # candidate + attention_pooled_behavior + other_sparse + dense
        mlp_input_dim = 0

@@ -118,88 +100,82 @@ class DIN(BaseModel):
            mlp_input_dim += self.candidate_feature.embedding_dim
        mlp_input_dim += behavior_emb_dim  # attention pooled
        mlp_input_dim += sum([f.embedding_dim for f in self.other_sparse_features])
+        mlp_input_dim += sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])

        # MLP for final prediction
        self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
        self.prediction_layer = PredictionLayer(task_type=self.task_type)

        # Register regularization weights
        self._register_regularization_weights(
+            embedding_attr='embedding',
+            include_modules=['attention', 'mlp', 'candidate_attention_proj']
        )

+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss
+        )

    def forward(self, x):
        # Get candidate item embedding
        if self.candidate_feature:
-            candidate_emb = self.embedding.embed_dict[
-                self.candidate_feature.embedding_name
-            ](
+            candidate_emb = self.embedding.embed_dict[self.candidate_feature.embedding_name](
                x[self.candidate_feature.name].long()
            )  # [B, emb_dim]
        else:
            candidate_emb = None

        # Get behavior sequence embedding
        behavior_seq = x[self.behavior_feature.name].long()  # [B, seq_len]
        behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](
            behavior_seq
        )  # [B, seq_len, emb_dim]

        # Create mask for padding
        if self.behavior_feature.padding_idx is not None:
-            mask = (
-                (behavior_seq != self.behavior_feature.padding_idx)
-                .unsqueeze(-1)
-                .float()
-            )
+            mask = (behavior_seq != self.behavior_feature.padding_idx).unsqueeze(-1).float()
        else:
            mask = (behavior_seq != 0).unsqueeze(-1).float()

        # Apply attention pooling
        if candidate_emb is not None:
            candidate_query = candidate_emb
            if self.candidate_attention_proj is not None:
                candidate_query = self.candidate_attention_proj(candidate_query)
            pooled_behavior = self.attention(
                query=candidate_query,
+                keys=behavior_emb,
+                mask=mask
            )  # [B, emb_dim]
        else:
            # If no candidate, use mean pooling
+            pooled_behavior = torch.sum(behavior_emb * mask, dim=1) / (mask.sum(dim=1) + 1e-9)

        # Get other features
        other_embeddings = []

        if candidate_emb is not None:
            other_embeddings.append(candidate_emb)

        other_embeddings.append(pooled_behavior)

        # Other sparse features
        for feat in self.other_sparse_features:
-            feat_emb = self.embedding.embed_dict[feat.embedding_name](
-                x[feat.name].long()
-            )
+            feat_emb = self.embedding.embed_dict[feat.embedding_name](x[feat.name].long())
            other_embeddings.append(feat_emb)

        # Dense features
        for feat in self.dense_features_list:
            val = x[feat.name].float()
            if val.dim() == 1:
                val = val.unsqueeze(1)
            other_embeddings.append(val)

        # Concatenate all features
        concat_input = torch.cat(other_embeddings, dim=-1)  # [B, total_dim]

        # MLP prediction
        y = self.mlp(concat_input)  # [B, 1]
        return self.prediction_layer(y)
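
For the pooling logic in DIN.forward, here is a small self-contained sketch of the two paths: the masked mean-pooling fallback (the 0.1.7 one-liner above) and a schematic attention pooling over the same padding mask. The dot-product scoring below is only a stand-in; nextrec's AttentionPoolingLayer appears to score each step with a small network over [query, key, query - key, query * key], as the DIEN diff constructs its attention input. Shapes and names here are illustrative, not the library's API.

# Sketch of DIN's pooling paths (illustrative shapes; not nextrec's layer).
import torch

B, T, E = 4, 6, 16
behavior_seq = torch.randint(1, 100, (B, T))
behavior_seq[:, -2:] = 0                                   # pretend the last steps are padding
behavior_emb = torch.randn(B, T, E)                        # [B, seq_len, emb_dim]
candidate_emb = torch.randn(B, E)                          # [B, emb_dim]

mask = (behavior_seq != 0).unsqueeze(-1).float()           # [B, T, 1], same as the forward pass

# Fallback path when no candidate feature exists: masked mean pooling.
pooled_mean = torch.sum(behavior_emb * mask, dim=1) / (mask.sum(dim=1) + 1e-9)  # [B, E]

# Attention path, schematically: score each behavior step against the candidate,
# mask out padding, normalize, and take the weighted sum.
scores = (behavior_emb * candidate_emb.unsqueeze(1)).sum(dim=-1, keepdim=True)  # [B, T, 1]
scores = scores.masked_fill(mask == 0, float("-inf"))
weights = torch.softmax(scores, dim=1)                     # [B, T, 1]
pooled_att = (weights * behavior_emb).sum(dim=1)           # [B, E]

print(pooled_mean.shape, pooled_att.shape)                 # torch.Size([4, 16]) torch.Size([4, 16])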