nextrec 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +4 -4
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -9
- nextrec/basic/callback.py +1 -0
- nextrec/basic/dataloader.py +168 -127
- nextrec/basic/features.py +24 -27
- nextrec/basic/layers.py +328 -159
- nextrec/basic/loggers.py +50 -37
- nextrec/basic/metrics.py +255 -147
- nextrec/basic/model.py +817 -462
- nextrec/data/__init__.py +5 -5
- nextrec/data/data_utils.py +16 -12
- nextrec/data/preprocessor.py +276 -252
- nextrec/loss/__init__.py +12 -12
- nextrec/loss/loss_utils.py +30 -22
- nextrec/loss/match_losses.py +116 -83
- nextrec/models/match/__init__.py +5 -5
- nextrec/models/match/dssm.py +70 -61
- nextrec/models/match/dssm_v2.py +61 -51
- nextrec/models/match/mind.py +89 -71
- nextrec/models/match/sdm.py +93 -81
- nextrec/models/match/youtube_dnn.py +62 -53
- nextrec/models/multi_task/esmm.py +49 -43
- nextrec/models/multi_task/mmoe.py +65 -56
- nextrec/models/multi_task/ple.py +92 -65
- nextrec/models/multi_task/share_bottom.py +48 -42
- nextrec/models/ranking/__init__.py +7 -7
- nextrec/models/ranking/afm.py +39 -30
- nextrec/models/ranking/autoint.py +70 -57
- nextrec/models/ranking/dcn.py +43 -35
- nextrec/models/ranking/deepfm.py +34 -28
- nextrec/models/ranking/dien.py +115 -79
- nextrec/models/ranking/din.py +84 -60
- nextrec/models/ranking/fibinet.py +51 -35
- nextrec/models/ranking/fm.py +28 -26
- nextrec/models/ranking/masknet.py +31 -31
- nextrec/models/ranking/pnn.py +30 -31
- nextrec/models/ranking/widedeep.py +36 -31
- nextrec/models/ranking/xdeepfm.py +46 -39
- nextrec/utils/__init__.py +9 -9
- nextrec/utils/embedding.py +1 -1
- nextrec/utils/initializer.py +23 -15
- nextrec/utils/optimizer.py +14 -10
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/METADATA +6 -40
- nextrec-0.1.2.dist-info/RECORD +51 -0
- nextrec-0.1.1.dist-info/RECORD +0 -51
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/WHEEL +0 -0
- {nextrec-0.1.1.dist-info → nextrec-0.1.2.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/dien.py
CHANGED
|
@@ -12,7 +12,14 @@ import torch
|
|
|
12
12
|
import torch.nn as nn
|
|
13
13
|
|
|
14
14
|
from nextrec.basic.model import BaseModel
|
|
15
|
-
from nextrec.basic.layers import
|
|
15
|
+
from nextrec.basic.layers import (
|
|
16
|
+
EmbeddingLayer,
|
|
17
|
+
MLP,
|
|
18
|
+
AttentionPoolingLayer,
|
|
19
|
+
DynamicGRU,
|
|
20
|
+
AUGRU,
|
|
21
|
+
PredictionLayer,
|
|
22
|
+
)
|
|
16
23
|
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
|
|
17
24
|
|
|
18
25
|
|
|
@@ -24,27 +31,29 @@ class DIEN(BaseModel):
|
|
|
24
31
|
@property
|
|
25
32
|
def task_type(self):
|
|
26
33
|
return "binary"
|
|
27
|
-
|
|
28
|
-
def __init__(
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
34
|
+
|
|
35
|
+
def __init__(
|
|
36
|
+
self,
|
|
37
|
+
dense_features: list[DenseFeature],
|
|
38
|
+
sparse_features: list[SparseFeature],
|
|
39
|
+
sequence_features: list[SequenceFeature],
|
|
40
|
+
mlp_params: dict,
|
|
41
|
+
gru_hidden_size: int = 64,
|
|
42
|
+
attention_hidden_units: list[int] = [80, 40],
|
|
43
|
+
attention_activation: str = "sigmoid",
|
|
44
|
+
use_negsampling: bool = False,
|
|
45
|
+
target: list[str] = [],
|
|
46
|
+
optimizer: str = "adam",
|
|
47
|
+
optimizer_params: dict = {},
|
|
48
|
+
loss: str | nn.Module | None = "bce",
|
|
49
|
+
device: str = "cpu",
|
|
50
|
+
model_id: str = "baseline",
|
|
51
|
+
embedding_l1_reg=1e-6,
|
|
52
|
+
dense_l1_reg=1e-5,
|
|
53
|
+
embedding_l2_reg=1e-5,
|
|
54
|
+
dense_l2_reg=1e-4,
|
|
55
|
+
):
|
|
56
|
+
|
|
48
57
|
super(DIEN, self).__init__(
|
|
49
58
|
dense_features=dense_features,
|
|
50
59
|
sparse_features=sparse_features,
|
|
@@ -57,158 +66,185 @@ class DIEN(BaseModel):
|
|
|
57
66
|
embedding_l2_reg=embedding_l2_reg,
|
|
58
67
|
dense_l2_reg=dense_l2_reg,
|
|
59
68
|
early_stop_patience=20,
|
|
60
|
-
model_id=model_id
|
|
69
|
+
model_id=model_id,
|
|
61
70
|
)
|
|
62
71
|
|
|
63
72
|
self.loss = loss
|
|
64
73
|
if self.loss is None:
|
|
65
74
|
self.loss = "bce"
|
|
66
|
-
|
|
75
|
+
|
|
67
76
|
self.use_negsampling = use_negsampling
|
|
68
|
-
|
|
77
|
+
|
|
69
78
|
# Features classification
|
|
70
79
|
if len(sequence_features) == 0:
|
|
71
|
-
raise ValueError(
|
|
72
|
-
|
|
80
|
+
raise ValueError(
|
|
81
|
+
"DIEN requires at least one sequence feature for user behavior history"
|
|
82
|
+
)
|
|
83
|
+
|
|
73
84
|
self.behavior_feature = sequence_features[0] # User behavior sequence
|
|
74
|
-
self.candidate_feature =
|
|
75
|
-
|
|
76
|
-
|
|
85
|
+
self.candidate_feature = (
|
|
86
|
+
sparse_features[-1] if sparse_features else None
|
|
87
|
+
) # Candidate item
|
|
88
|
+
|
|
89
|
+
self.other_sparse_features = (
|
|
90
|
+
sparse_features[:-1] if self.candidate_feature else sparse_features
|
|
91
|
+
)
|
|
77
92
|
self.dense_features_list = dense_features
|
|
78
|
-
|
|
93
|
+
|
|
79
94
|
# All features for embedding
|
|
80
95
|
self.all_features = dense_features + sparse_features + sequence_features
|
|
81
96
|
|
|
82
97
|
# Embedding layer
|
|
83
98
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
84
|
-
|
|
99
|
+
|
|
85
100
|
behavior_emb_dim = self.behavior_feature.embedding_dim
|
|
86
101
|
self.candidate_proj = None
|
|
87
|
-
if
|
|
88
|
-
self.
|
|
89
|
-
|
|
102
|
+
if (
|
|
103
|
+
self.candidate_feature is not None
|
|
104
|
+
and self.candidate_feature.embedding_dim != gru_hidden_size
|
|
105
|
+
):
|
|
106
|
+
self.candidate_proj = nn.Linear(
|
|
107
|
+
self.candidate_feature.embedding_dim, gru_hidden_size
|
|
108
|
+
)
|
|
109
|
+
|
|
90
110
|
# Interest Extractor Layer (GRU)
|
|
91
111
|
self.interest_extractor = DynamicGRU(
|
|
92
|
-
input_size=behavior_emb_dim,
|
|
93
|
-
hidden_size=gru_hidden_size
|
|
112
|
+
input_size=behavior_emb_dim, hidden_size=gru_hidden_size
|
|
94
113
|
)
|
|
95
|
-
|
|
114
|
+
|
|
96
115
|
# Attention layer for computing attention scores
|
|
97
116
|
self.attention_layer = AttentionPoolingLayer(
|
|
98
117
|
embedding_dim=gru_hidden_size,
|
|
99
118
|
hidden_units=attention_hidden_units,
|
|
100
119
|
activation=attention_activation,
|
|
101
|
-
use_softmax=False # We'll use scores directly for AUGRU
|
|
120
|
+
use_softmax=False, # We'll use scores directly for AUGRU
|
|
102
121
|
)
|
|
103
|
-
|
|
122
|
+
|
|
104
123
|
# Interest Evolution Layer (AUGRU)
|
|
105
124
|
self.interest_evolution = AUGRU(
|
|
106
|
-
input_size=gru_hidden_size,
|
|
107
|
-
hidden_size=gru_hidden_size
|
|
125
|
+
input_size=gru_hidden_size, hidden_size=gru_hidden_size
|
|
108
126
|
)
|
|
109
|
-
|
|
127
|
+
|
|
110
128
|
# Calculate MLP input dimension
|
|
111
129
|
mlp_input_dim = 0
|
|
112
130
|
if self.candidate_feature:
|
|
113
131
|
mlp_input_dim += self.candidate_feature.embedding_dim
|
|
114
132
|
mlp_input_dim += gru_hidden_size # final interest state
|
|
115
133
|
mlp_input_dim += sum([f.embedding_dim for f in self.other_sparse_features])
|
|
116
|
-
mlp_input_dim += sum(
|
|
117
|
-
|
|
134
|
+
mlp_input_dim += sum(
|
|
135
|
+
[getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
|
|
136
|
+
)
|
|
137
|
+
|
|
118
138
|
# MLP for final prediction
|
|
119
139
|
self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
|
|
120
140
|
self.prediction_layer = PredictionLayer(task_type=self.task_type)
|
|
121
141
|
|
|
122
142
|
# Register regularization weights
|
|
123
143
|
self._register_regularization_weights(
|
|
124
|
-
embedding_attr=
|
|
125
|
-
include_modules=[
|
|
144
|
+
embedding_attr="embedding",
|
|
145
|
+
include_modules=[
|
|
146
|
+
"interest_extractor",
|
|
147
|
+
"interest_evolution",
|
|
148
|
+
"attention_layer",
|
|
149
|
+
"mlp",
|
|
150
|
+
"candidate_proj",
|
|
151
|
+
],
|
|
126
152
|
)
|
|
127
153
|
|
|
128
|
-
self.compile(
|
|
129
|
-
optimizer=optimizer,
|
|
130
|
-
optimizer_params=optimizer_params,
|
|
131
|
-
loss=loss
|
|
132
|
-
)
|
|
154
|
+
self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss)
|
|
133
155
|
|
|
134
156
|
def forward(self, x):
|
|
135
157
|
# Get candidate item embedding
|
|
136
158
|
if self.candidate_feature:
|
|
137
|
-
candidate_emb = self.embedding.embed_dict[
|
|
159
|
+
candidate_emb = self.embedding.embed_dict[
|
|
160
|
+
self.candidate_feature.embedding_name
|
|
161
|
+
](
|
|
138
162
|
x[self.candidate_feature.name].long()
|
|
139
163
|
) # [B, emb_dim]
|
|
140
164
|
else:
|
|
141
165
|
raise ValueError("DIEN requires a candidate item feature")
|
|
142
|
-
|
|
166
|
+
|
|
143
167
|
# Get behavior sequence embedding
|
|
144
168
|
behavior_seq = x[self.behavior_feature.name].long() # [B, seq_len]
|
|
145
169
|
behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](
|
|
146
170
|
behavior_seq
|
|
147
171
|
) # [B, seq_len, emb_dim]
|
|
148
|
-
|
|
172
|
+
|
|
149
173
|
# Create mask for padding
|
|
150
174
|
if self.behavior_feature.padding_idx is not None:
|
|
151
|
-
mask = (
|
|
175
|
+
mask = (
|
|
176
|
+
(behavior_seq != self.behavior_feature.padding_idx)
|
|
177
|
+
.unsqueeze(-1)
|
|
178
|
+
.float()
|
|
179
|
+
)
|
|
152
180
|
else:
|
|
153
181
|
mask = (behavior_seq != 0).unsqueeze(-1).float()
|
|
154
|
-
|
|
182
|
+
|
|
155
183
|
# Step 1: Interest Extractor (GRU)
|
|
156
|
-
interest_states, _ = self.interest_extractor(
|
|
157
|
-
|
|
184
|
+
interest_states, _ = self.interest_extractor(
|
|
185
|
+
behavior_emb
|
|
186
|
+
) # [B, seq_len, hidden_size]
|
|
187
|
+
|
|
158
188
|
# Step 2: Compute attention scores for each time step
|
|
159
189
|
batch_size, seq_len, hidden_size = interest_states.shape
|
|
160
|
-
|
|
190
|
+
|
|
161
191
|
# Project candidate to hidden_size if necessary (defined in __init__)
|
|
162
192
|
if self.candidate_proj is not None:
|
|
163
193
|
candidate_for_attention = self.candidate_proj(candidate_emb)
|
|
164
194
|
else:
|
|
165
195
|
candidate_for_attention = candidate_emb
|
|
166
|
-
|
|
196
|
+
|
|
167
197
|
# Compute attention scores for AUGRU
|
|
168
198
|
attention_scores = []
|
|
169
199
|
for t in range(seq_len):
|
|
170
200
|
score = self.attention_layer.attention_net(
|
|
171
|
-
torch.cat(
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
201
|
+
torch.cat(
|
|
202
|
+
[
|
|
203
|
+
candidate_for_attention,
|
|
204
|
+
interest_states[:, t, :],
|
|
205
|
+
candidate_for_attention - interest_states[:, t, :],
|
|
206
|
+
candidate_for_attention * interest_states[:, t, :],
|
|
207
|
+
],
|
|
208
|
+
dim=-1,
|
|
209
|
+
)
|
|
177
210
|
) # [B, 1]
|
|
178
211
|
attention_scores.append(score)
|
|
179
|
-
|
|
180
|
-
attention_scores = torch.cat(attention_scores, dim=1).unsqueeze(
|
|
212
|
+
|
|
213
|
+
attention_scores = torch.cat(attention_scores, dim=1).unsqueeze(
|
|
214
|
+
-1
|
|
215
|
+
) # [B, seq_len, 1]
|
|
181
216
|
attention_scores = torch.sigmoid(attention_scores) # Normalize to [0, 1]
|
|
182
|
-
|
|
217
|
+
|
|
183
218
|
# Apply mask to attention scores
|
|
184
219
|
attention_scores = attention_scores * mask
|
|
185
|
-
|
|
220
|
+
|
|
186
221
|
# Step 3: Interest Evolution (AUGRU)
|
|
187
222
|
final_states, final_interest = self.interest_evolution(
|
|
188
|
-
interest_states,
|
|
189
|
-
attention_scores
|
|
223
|
+
interest_states, attention_scores
|
|
190
224
|
) # final_interest: [B, hidden_size]
|
|
191
|
-
|
|
225
|
+
|
|
192
226
|
# Get other features
|
|
193
227
|
other_embeddings = []
|
|
194
228
|
other_embeddings.append(candidate_emb)
|
|
195
229
|
other_embeddings.append(final_interest)
|
|
196
|
-
|
|
230
|
+
|
|
197
231
|
# Other sparse features
|
|
198
232
|
for feat in self.other_sparse_features:
|
|
199
|
-
feat_emb = self.embedding.embed_dict[feat.embedding_name](
|
|
233
|
+
feat_emb = self.embedding.embed_dict[feat.embedding_name](
|
|
234
|
+
x[feat.name].long()
|
|
235
|
+
)
|
|
200
236
|
other_embeddings.append(feat_emb)
|
|
201
|
-
|
|
237
|
+
|
|
202
238
|
# Dense features
|
|
203
239
|
for feat in self.dense_features_list:
|
|
204
240
|
val = x[feat.name].float()
|
|
205
241
|
if val.dim() == 1:
|
|
206
242
|
val = val.unsqueeze(1)
|
|
207
243
|
other_embeddings.append(val)
|
|
208
|
-
|
|
244
|
+
|
|
209
245
|
# Concatenate all features
|
|
210
246
|
concat_input = torch.cat(other_embeddings, dim=-1) # [B, total_dim]
|
|
211
|
-
|
|
247
|
+
|
|
212
248
|
# MLP prediction
|
|
213
249
|
y = self.mlp(concat_input) # [B, 1]
|
|
214
250
|
return self.prediction_layer(y)
|
nextrec/models/ranking/din.py
CHANGED
|
@@ -12,7 +12,12 @@ import torch
|
|
|
12
12
|
import torch.nn as nn
|
|
13
13
|
|
|
14
14
|
from nextrec.basic.model import BaseModel
|
|
15
|
-
from nextrec.basic.layers import
|
|
15
|
+
from nextrec.basic.layers import (
|
|
16
|
+
EmbeddingLayer,
|
|
17
|
+
MLP,
|
|
18
|
+
AttentionPoolingLayer,
|
|
19
|
+
PredictionLayer,
|
|
20
|
+
)
|
|
16
21
|
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
|
|
17
22
|
|
|
18
23
|
|
|
@@ -24,26 +29,28 @@ class DIN(BaseModel):
|
|
|
24
29
|
@property
|
|
25
30
|
def task_type(self):
|
|
26
31
|
return "binary"
|
|
27
|
-
|
|
28
|
-
def __init__(
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
32
|
+
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
dense_features: list[DenseFeature],
|
|
36
|
+
sparse_features: list[SparseFeature],
|
|
37
|
+
sequence_features: list[SequenceFeature],
|
|
38
|
+
mlp_params: dict,
|
|
39
|
+
attention_hidden_units: list[int] = [80, 40],
|
|
40
|
+
attention_activation: str = "sigmoid",
|
|
41
|
+
attention_use_softmax: bool = True,
|
|
42
|
+
target: list[str] = [],
|
|
43
|
+
optimizer: str = "adam",
|
|
44
|
+
optimizer_params: dict = {},
|
|
45
|
+
loss: str | nn.Module | None = "bce",
|
|
46
|
+
device: str = "cpu",
|
|
47
|
+
model_id: str = "baseline",
|
|
48
|
+
embedding_l1_reg=1e-6,
|
|
49
|
+
dense_l1_reg=1e-5,
|
|
50
|
+
embedding_l2_reg=1e-5,
|
|
51
|
+
dense_l2_reg=1e-4,
|
|
52
|
+
):
|
|
53
|
+
|
|
47
54
|
super(DIN, self).__init__(
|
|
48
55
|
dense_features=dense_features,
|
|
49
56
|
sparse_features=sparse_features,
|
|
@@ -56,43 +63,54 @@ class DIN(BaseModel):
|
|
|
56
63
|
embedding_l2_reg=embedding_l2_reg,
|
|
57
64
|
dense_l2_reg=dense_l2_reg,
|
|
58
65
|
early_stop_patience=20,
|
|
59
|
-
model_id=model_id
|
|
66
|
+
model_id=model_id,
|
|
60
67
|
)
|
|
61
68
|
|
|
62
69
|
self.loss = loss
|
|
63
70
|
if self.loss is None:
|
|
64
71
|
self.loss = "bce"
|
|
65
|
-
|
|
72
|
+
|
|
66
73
|
# Features classification
|
|
67
74
|
# DIN requires: candidate item + user behavior sequence + other features
|
|
68
75
|
if len(sequence_features) == 0:
|
|
69
|
-
raise ValueError(
|
|
70
|
-
|
|
76
|
+
raise ValueError(
|
|
77
|
+
"DIN requires at least one sequence feature for user behavior history"
|
|
78
|
+
)
|
|
79
|
+
|
|
71
80
|
self.behavior_feature = sequence_features[0] # User behavior sequence
|
|
72
|
-
self.candidate_feature =
|
|
73
|
-
|
|
81
|
+
self.candidate_feature = (
|
|
82
|
+
sparse_features[-1] if sparse_features else None
|
|
83
|
+
) # Candidate item
|
|
84
|
+
|
|
74
85
|
# Other features (excluding behavior sequence in final concatenation)
|
|
75
|
-
self.other_sparse_features =
|
|
86
|
+
self.other_sparse_features = (
|
|
87
|
+
sparse_features[:-1] if self.candidate_feature else sparse_features
|
|
88
|
+
)
|
|
76
89
|
self.dense_features_list = dense_features
|
|
77
|
-
|
|
90
|
+
|
|
78
91
|
# All features for embedding
|
|
79
92
|
self.all_features = dense_features + sparse_features + sequence_features
|
|
80
93
|
|
|
81
94
|
# Embedding layer
|
|
82
95
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
83
|
-
|
|
96
|
+
|
|
84
97
|
# Attention layer for behavior sequence
|
|
85
98
|
behavior_emb_dim = self.behavior_feature.embedding_dim
|
|
86
99
|
self.candidate_attention_proj = None
|
|
87
|
-
if
|
|
88
|
-
self.
|
|
100
|
+
if (
|
|
101
|
+
self.candidate_feature is not None
|
|
102
|
+
and self.candidate_feature.embedding_dim != behavior_emb_dim
|
|
103
|
+
):
|
|
104
|
+
self.candidate_attention_proj = nn.Linear(
|
|
105
|
+
self.candidate_feature.embedding_dim, behavior_emb_dim
|
|
106
|
+
)
|
|
89
107
|
self.attention = AttentionPoolingLayer(
|
|
90
108
|
embedding_dim=behavior_emb_dim,
|
|
91
109
|
hidden_units=attention_hidden_units,
|
|
92
110
|
activation=attention_activation,
|
|
93
|
-
use_softmax=attention_use_softmax
|
|
111
|
+
use_softmax=attention_use_softmax,
|
|
94
112
|
)
|
|
95
|
-
|
|
113
|
+
|
|
96
114
|
# Calculate MLP input dimension
|
|
97
115
|
# candidate + attention_pooled_behavior + other_sparse + dense
|
|
98
116
|
mlp_input_dim = 0
|
|
@@ -100,82 +118,88 @@ class DIN(BaseModel):
|
|
|
100
118
|
mlp_input_dim += self.candidate_feature.embedding_dim
|
|
101
119
|
mlp_input_dim += behavior_emb_dim # attention pooled
|
|
102
120
|
mlp_input_dim += sum([f.embedding_dim for f in self.other_sparse_features])
|
|
103
|
-
mlp_input_dim += sum(
|
|
104
|
-
|
|
121
|
+
mlp_input_dim += sum(
|
|
122
|
+
[getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
|
|
123
|
+
)
|
|
124
|
+
|
|
105
125
|
# MLP for final prediction
|
|
106
126
|
self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
|
|
107
127
|
self.prediction_layer = PredictionLayer(task_type=self.task_type)
|
|
108
128
|
|
|
109
129
|
# Register regularization weights
|
|
110
130
|
self._register_regularization_weights(
|
|
111
|
-
embedding_attr=
|
|
112
|
-
include_modules=[
|
|
131
|
+
embedding_attr="embedding",
|
|
132
|
+
include_modules=["attention", "mlp", "candidate_attention_proj"],
|
|
113
133
|
)
|
|
114
134
|
|
|
115
|
-
self.compile(
|
|
116
|
-
optimizer=optimizer,
|
|
117
|
-
optimizer_params=optimizer_params,
|
|
118
|
-
loss=loss
|
|
119
|
-
)
|
|
135
|
+
self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss)
|
|
120
136
|
|
|
121
137
|
def forward(self, x):
|
|
122
138
|
# Get candidate item embedding
|
|
123
139
|
if self.candidate_feature:
|
|
124
|
-
candidate_emb = self.embedding.embed_dict[
|
|
140
|
+
candidate_emb = self.embedding.embed_dict[
|
|
141
|
+
self.candidate_feature.embedding_name
|
|
142
|
+
](
|
|
125
143
|
x[self.candidate_feature.name].long()
|
|
126
144
|
) # [B, emb_dim]
|
|
127
145
|
else:
|
|
128
146
|
candidate_emb = None
|
|
129
|
-
|
|
147
|
+
|
|
130
148
|
# Get behavior sequence embedding
|
|
131
149
|
behavior_seq = x[self.behavior_feature.name].long() # [B, seq_len]
|
|
132
150
|
behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](
|
|
133
151
|
behavior_seq
|
|
134
152
|
) # [B, seq_len, emb_dim]
|
|
135
|
-
|
|
153
|
+
|
|
136
154
|
# Create mask for padding
|
|
137
155
|
if self.behavior_feature.padding_idx is not None:
|
|
138
|
-
mask = (
|
|
156
|
+
mask = (
|
|
157
|
+
(behavior_seq != self.behavior_feature.padding_idx)
|
|
158
|
+
.unsqueeze(-1)
|
|
159
|
+
.float()
|
|
160
|
+
)
|
|
139
161
|
else:
|
|
140
162
|
mask = (behavior_seq != 0).unsqueeze(-1).float()
|
|
141
|
-
|
|
163
|
+
|
|
142
164
|
# Apply attention pooling
|
|
143
165
|
if candidate_emb is not None:
|
|
144
166
|
candidate_query = candidate_emb
|
|
145
167
|
if self.candidate_attention_proj is not None:
|
|
146
168
|
candidate_query = self.candidate_attention_proj(candidate_query)
|
|
147
169
|
pooled_behavior = self.attention(
|
|
148
|
-
query=candidate_query,
|
|
149
|
-
keys=behavior_emb,
|
|
150
|
-
mask=mask
|
|
170
|
+
query=candidate_query, keys=behavior_emb, mask=mask
|
|
151
171
|
) # [B, emb_dim]
|
|
152
172
|
else:
|
|
153
173
|
# If no candidate, use mean pooling
|
|
154
|
-
pooled_behavior = torch.sum(behavior_emb * mask, dim=1) / (
|
|
155
|
-
|
|
174
|
+
pooled_behavior = torch.sum(behavior_emb * mask, dim=1) / (
|
|
175
|
+
mask.sum(dim=1) + 1e-9
|
|
176
|
+
)
|
|
177
|
+
|
|
156
178
|
# Get other features
|
|
157
179
|
other_embeddings = []
|
|
158
|
-
|
|
180
|
+
|
|
159
181
|
if candidate_emb is not None:
|
|
160
182
|
other_embeddings.append(candidate_emb)
|
|
161
|
-
|
|
183
|
+
|
|
162
184
|
other_embeddings.append(pooled_behavior)
|
|
163
|
-
|
|
185
|
+
|
|
164
186
|
# Other sparse features
|
|
165
187
|
for feat in self.other_sparse_features:
|
|
166
|
-
feat_emb = self.embedding.embed_dict[feat.embedding_name](
|
|
188
|
+
feat_emb = self.embedding.embed_dict[feat.embedding_name](
|
|
189
|
+
x[feat.name].long()
|
|
190
|
+
)
|
|
167
191
|
other_embeddings.append(feat_emb)
|
|
168
|
-
|
|
192
|
+
|
|
169
193
|
# Dense features
|
|
170
194
|
for feat in self.dense_features_list:
|
|
171
195
|
val = x[feat.name].float()
|
|
172
196
|
if val.dim() == 1:
|
|
173
197
|
val = val.unsqueeze(1)
|
|
174
198
|
other_embeddings.append(val)
|
|
175
|
-
|
|
199
|
+
|
|
176
200
|
# Concatenate all features
|
|
177
201
|
concat_input = torch.cat(other_embeddings, dim=-1) # [B, total_dim]
|
|
178
|
-
|
|
202
|
+
|
|
179
203
|
# MLP prediction
|
|
180
204
|
y = self.mlp(concat_input) # [B, 1]
|
|
181
205
|
return self.prediction_layer(y)
|